Add instances, tests, profiler, example

561e787e · Bartlomiej Kocot · 9f008852 · 561e787e · 561e787e · 561e787e
Commit 561e787e authored Aug 25, 2023 by Bartlomiej Kocot
13 changed files
--- a/example/52_image_to_column/image_to_column_f32.cpp
+++ b/example/52_image_to_column/image_to_column_f32.cpp
@@ -10,11 +10,11 @@ using InLayout = ck::tensor_layout::convolution::GNHWC;

 // clang-format off
 using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
-// ######| NDimSpatial|   ALayout|   BLayout|   DsLayout|  ELayout|       AData|       BData|     AccData|         CShuffle|       DsData|      EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad|      NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer|    MXdl|    NXdl|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl|   CDEBlockTransfer| CDEBlockTransfer|
-// ######|            |          |          |           |         |        Type|        Type|        Type|         DataType|         Type|       Type|    Operation|    Operation|      Operation|  DataSpecialization| GemmM| GemmN| PrefetchStage|  Size| Block| Block| Block|    |    |  XDL|  XDL| PerWave| PerWave|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|      PerWave|      PerWave|  _MBlock_MPerBlock|  ScalarPerVector|
-// ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          |   PerShuffle|   PerShuffle|  _NBlock_NPerBlock|       _NPerBlock|
-// ######|            |          |          |           |         |            |            |            |                 |             |           |             |             |               |                    |      |      |              |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |             |             |                   |                 |
-         < NDimSpatial, InLayout, InDataType, OutDataType, 256, 128, 128, S<128, 128>, S<16, 16>, 4>;
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+                              < NDimSpatial, InLayout, InDataType, OutDataType,   256,   128,   128, S<16, 16>,     1>;
 // clang-format on

 bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params)

--- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/stream_utility.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/host_utility/io.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename InputGridDesc,
+          typename InputDataType,
+          typename OutputGridDesc,
+          typename OutputDataType,
+          typename Block2ETileMap,
+          typename GridwiseImageToColumnKernel>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+#endif
+        kernel_image_to_column(const InputGridDesc in_grid_desc,
+                               const InputDataType* __restrict__ p_in_global,
+                               const OutputGridDesc out_grid_desc,
+                               OutputDataType* __restrict__ p_out_global,
+                               const Block2ETileMap block_2_tile_map)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) ||             \
+    defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \
+    defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__))
+    GridwiseImageToColumnKernel::Run(
+        in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map);
+#else
+    ignore = in_grid_desc;
+    ignore = p_in_global;
+    ignore = out_grid_desc;
+    ignore = p_out_global;
+    ignore = block_2_tile_map;
+#endif
+}
+
+// Image to column for input layout NDHWC:
+//   input : input image [N, Di, Hi, Wi, C],
+//   output : output image [N * Do * Ho * Wo, Z * Y * X * C]
+template <index_t NDimSpatial,
+          typename InputLayout,
+          typename InputDataType,
+          typename OutputDataType,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t KPerBlock,
+          typename ThreadClusterLengths,
+          index_t ScalarPerVector>
+struct DeviceImageToColumnImpl
+    : public DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>
+{
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+
+    static constexpr auto conv_to_gemm_transformer =
+        TransformConvFwdToGemm<NDimSpatial, ConvolutionForwardSpecialization::Default>{};
+
+    static constexpr auto matrix_padder =
+        MatrixPadder<GemmSpecialization::MKPadding, index_t, index_t, index_t>{
+            MPerBlock, 0 /* NPerBlock*/, KPerBlock};
+
+    // Use MakeADescriptor_M_K from grouped convolution forward
+    static auto
+    MakeInputDescriptor_M_K(const ck::index_t N,
+                            const ck::index_t C,
+                            const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                            const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                            const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                            const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                            const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                            const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                            const std::array<index_t, NDimSpatial>& input_left_pads,
+                            const std::array<index_t, NDimSpatial>& input_right_pads)
+    {
+        std::array<index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{1};
+        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{1};
+        std::array<index_t, NDimSpatial + 3> c_g_n_k_wos_lengths{1};
+
+        auto copy = [](const auto& x, auto& y, index_t dst_offset) {
+            std::copy(x.begin(), x.end(), y.begin() + dst_offset);
+        };
+
+        constexpr index_t spatial_offset = 3;
+
+        copy(input_spatial_lengths, a_g_n_c_wis_lengths, spatial_offset);
+        copy(filter_spatial_lengths, b_g_k_c_xs_lengths, spatial_offset);
+        copy(output_spatial_lengths, c_g_n_k_wos_lengths, spatial_offset);
+
+        // fill only significant values (C and N)
+        a_g_n_c_wis_lengths[I1] = N;
+        a_g_n_c_wis_lengths[I2] = C;
+        b_g_k_c_xs_lengths[I2]  = C;
+        c_g_n_k_wos_lengths[I1] = N;
+
+        const auto in_gemmmraw_gemmkraw_desc =
+            conv_to_gemm_transformer.template MakeADescriptor_M_K<InputLayout>(
+                a_g_n_c_wis_lengths,
+                input_g_n_c_wis_strides,
+                b_g_k_c_xs_lengths,
+                {}, // not needed for A Descriptor
+                c_g_n_k_wos_lengths,
+                {}, // not needed for A Descriptor
+                conv_filter_strides,
+                conv_filter_dilations,
+                input_left_pads,
+                input_right_pads);
+
+        const auto in_gemmm_gemmk_desc =
+            matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
+        return in_gemmm_gemmk_desc;
+    }
+
+    static auto
+    MakeOutDescriptor_M_K(const ck::index_t N,
+                          const ck::index_t C,
+                          const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                          const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                          const std::array<index_t, 2>& output_m_k_strides)
+    {
+        const index_t NDoHoWo =
+            N * ck::accumulate_n<index_t>(
+                    output_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+        const index_t CZYX =
+            C * ck::accumulate_n<index_t>(
+                    filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>());
+        const auto desc_mraw_kraw = make_naive_tensor_descriptor(
+            make_tuple(NDoHoWo, CZYX), make_tuple(output_m_k_strides[I0], output_m_k_strides[I1]));
+
+        const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw);
+        return desc_m_k;
+    }
+
+    using InputGridDesc =
+        remove_cvref_t<decltype(MakeInputDescriptor_M_K(1, 1, {}, {}, {}, {}, {}, {}, {}, {}))>;
+    using OutputGridDesc = remove_cvref_t<decltype(MakeOutDescriptor_M_K(1, 1, {}, {}, {}))>;
+
+    using Block2ETileMap = remove_cvref_t<decltype(
+        BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(OutputGridDesc{}))>;
+
+    using GridwiseImageToColumnKernel = GridwiseImageToColumn<InputGridDesc,
+                                                              InputDataType,
+                                                              OutputGridDesc,
+                                                              OutputDataType,
+                                                              BlockSize,
+                                                              MPerBlock,
+                                                              KPerBlock,
+                                                              ThreadClusterLengths,
+                                                              ScalarPerVector,
+                                                              Block2ETileMap>;
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const void* p_in, // input image
+                 void* p_out,      // output image
+                 const ck::index_t N,
+                 const ck::index_t C,
+                 const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                 const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                 const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                 const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                 const std::array<index_t, 2>& output_m_k_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<index_t, NDimSpatial>& input_left_pads,
+                 const std::array<index_t, NDimSpatial>& input_right_pads)
+            : C_(C),
+              X_(filter_spatial_lengths[NDimSpatial - I1]),
+              p_in_{static_cast<const InputDataType*>(p_in)},
+              p_out_{static_cast<OutputDataType*>(p_out)},
+              input_g_n_c_wis_strides_{input_g_n_c_wis_strides},
+              conv_filter_strides_{conv_filter_strides},
+              conv_filter_dilations_{conv_filter_dilations},
+              input_left_pads_{input_left_pads},
+              input_right_pads_{input_right_pads}
+        {
+
+            in_grid_desc_m_k_ = MakeInputDescriptor_M_K(N,
+                                                        C,
+                                                        input_spatial_lengths,
+                                                        filter_spatial_lengths,
+                                                        output_spatial_lengths,
+                                                        input_g_n_c_wis_strides,
+
+                                                        conv_filter_strides,
+                                                        conv_filter_dilations,
+                                                        input_left_pads,
+                                                        input_right_pads);
+
+            out_grid_desc_m_k_ = MakeOutDescriptor_M_K(
+                N, C, filter_spatial_lengths, output_spatial_lengths, output_m_k_strides);
+        }
+
+        void Print() const
+        {
+            std::cout << in_grid_desc_m_k_ << std::endl;
+            std::cout << out_grid_desc_m_k_ << std::endl;
+        }
+
+        const ck::index_t C_;
+        const ck::index_t X_;
+
+        const InputDataType* p_in_;
+        OutputDataType* p_out_;
+
+        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides_;
+        const std::array<index_t, NDimSpatial>& conv_filter_strides_;
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations_;
+        const std::array<index_t, NDimSpatial>& input_left_pads_;
+        const std::array<index_t, NDimSpatial>& input_right_pads_;
+
+        InputGridDesc in_grid_desc_m_k_;
+        OutputGridDesc out_grid_desc_m_k_;
+    };
+
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            const auto block_2_tile_map =
+                BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, KPerBlock, OutputGridDesc>(
+                    arg.out_grid_desc_m_k_, I1 /*M01*/);
+            const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_);
+            const auto kernel       = kernel_image_to_column<InputGridDesc,
+                                                       InputDataType,
+                                                       OutputGridDesc,
+                                                       OutputDataType,
+                                                       Block2ETileMap,
+                                                       GridwiseImageToColumnKernel>;
+
+            float elapsed_time = launch_and_time_kernel(stream_config,
+                                                        kernel,
+                                                        dim3(grid_size),
+                                                        dim3(BlockSize),
+                                                        0,
+                                                        arg.in_grid_desc_m_k_,
+                                                        arg.p_in_,
+                                                        arg.out_grid_desc_m_k_,
+                                                        arg.p_out_,
+                                                        block_2_tile_map);
+            return elapsed_time;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    bool IsSupportedArgument(const Argument& arg)
+    {
+        using namespace tensor_layout::convolution;
+        if(!(std::is_same_v<InputLayout, GNWC> || std::is_same_v<InputLayout, GNHWC> ||
+             std::is_same_v<InputLayout, GNDHWC>))
+        {
+            return false;
+        }
+        if(!(NDimSpatial >= 1 && NDimSpatial <= 3))
+        {
+            return false;
+        }
+
+        const auto x_pad_left  = arg.input_left_pads_[NDimSpatial - I1];
+        const auto x_pad_right = arg.input_right_pads_[NDimSpatial - I1];
+        const auto dilation_x  = arg.conv_filter_dilations_[NDimSpatial - I1];
+        const auto stride_x    = arg.conv_filter_strides_[NDimSpatial - I1];
+        bool is_c_packed       = arg.input_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_;
+
+        // check vector access of filter window row (only C if C is not packed)
+        if(!is_c_packed && arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of filter window row (X * C)
+        if(arg.X_ * arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of pads (x_pad_left/x_pad_right * C)
+        if(x_pad_left * arg.C_ % ScalarPerVector != 0 ||
+           x_pad_right * arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of with stride and pad
+        if((x_pad_left != 0 || x_pad_right != 0) && stride_x > 1 && arg.C_ % ScalarPerVector != 0)
+            return false;
+        // check vector access of with dilation
+        if(dilation_x > 1 && arg.C_ % ScalarPerVector != 0)
+            return false;
+
+        return GridwiseImageToColumnKernel::CheckValidity(arg.in_grid_desc_m_k_,
+                                                          arg.out_grid_desc_m_k_);
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const void* p_in, // input image
+                             void* p_out,      // output image
+                             const ck::index_t N,
+                             const ck::index_t C,
+                             const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                             const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                             const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                             const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                             const std::array<index_t, 2>& output_m_k_strides,
+                             const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                             const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                             const std::array<index_t, NDimSpatial>& input_left_pads,
+                             const std::array<index_t, NDimSpatial>& input_right_pads)
+    {
+        return Argument{static_cast<const InputDataType*>(p_in),
+                        static_cast<OutputDataType*>(p_out),
+                        N,
+                        C,
+                        input_spatial_lengths,
+                        filter_spatial_lengths,
+                        output_spatial_lengths,
+                        input_g_n_c_wis_strides,
+                        output_m_k_strides,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in, // input image
+                        void* p_out,      // output image
+                        const ck::index_t N,
+                        const ck::index_t C,
+                        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+                        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+                        const std::array<index_t, NDimSpatial + 3>& input_g_n_c_wis_strides,
+                        const std::array<index_t, 2>& output_m_k_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<index_t, NDimSpatial>& input_left_pads,
+                        const std::array<index_t, NDimSpatial>& input_right_pads) override
+    {
+        return std::make_unique<Argument>(static_cast<const InputDataType*>(p_in),
+                                          static_cast<OutputDataType*>(p_out),
+                                          N,
+                                          C,
+                                          input_spatial_lengths,
+                                          filter_spatial_lengths,
+                                          output_spatial_lengths,
+                                          input_g_n_c_wis_strides,
+                                          output_m_k_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceImageToColumn"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << KPerBlock << ", "
+            << ScalarPerVector
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp
@@ -24,7 +24,6 @@ template <typename InputGridDesc,
          index_t BlockSize,
          index_t MPerBlock,
          index_t KPerBlock,
-          typename SliceLengths,
          typename ThreadClusterLengths,
          index_t ScalarPerVector,
          typename Block2ETileMap>
@@ -65,7 +64,7 @@ struct GridwiseImageToColumn
            decltype(tie(out_grid_desc)),
            tensor_operation::element_wise::PassThrough,
            Sequence<static_cast<index_t>(InMemoryDataOperationEnum::Set)>,
-            SliceLengths,
+            Sequence<MPerBlock, KPerBlock>,
            ThreadClusterLengths,
            Sequence<0, 1>,
            Sequence<0, 1>,
@@ -83,7 +82,6 @@ struct GridwiseImageToColumn
            tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf));
    }

-    // template <typename... TsIn, typename... TsOut>
    __host__ static constexpr bool CheckValidity(const InputGridDesc& in_grid_desc,
                                                 const OutputGridDesc& out_grid_desc)
    {

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp
@@ -116,15 +116,15 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                  static_cast<ck::long_index_t>(x * arg.conv_dilations_[0]) -
                                  static_cast<ck::long_index_t>(arg.in_left_pads_[0]);

-                        if(wi >= 0 &&
-                           ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
+                        for(index_t c = 0; c < C; ++c)
                        {
-                            for(index_t c = 0; c < C; ++c)
+                            if(wi >= 0 &&
+                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[3])
                            {
-                                column++;
                                InDataType v_in          = arg.input_(0, n, c, wi);
                                arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
                            }
+                            column++;
                        }
                    }
                };
@@ -154,18 +154,18 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                      static_cast<ck::long_index_t>(x * arg.conv_dilations_[1]) -
                                      static_cast<ck::long_index_t>(arg.in_left_pads_[1]);

-                            if(hi >= 0 &&
-                               ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[3] &&
-                               wi >= 0 &&
-                               ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
+                            for(index_t c = 0; c < C; ++c)
                            {
-                                for(index_t c = 0; c < C; ++c)
-                                {

+                                if(hi >= 0 &&
+                                   ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[3] &&
+                                   wi >= 0 &&
+                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[4])
+                                {
                                    InDataType v_in          = arg.input_(0, n, c, hi, wi);
                                    arg.output_(row, column) = ck::type_convert<OutDataType>(v_in);
-                                    column++;
                                }
+                                column++;
                            }
                        }
                    }
@@ -201,20 +201,23 @@ struct ReferenceImageToColumn : public device::BaseOperator
                                    static_cast<ck::long_index_t>(wo * arg.conv_strides_[2]) +
                                    static_cast<ck::long_index_t>(x * arg.conv_dilations_[2]) -
                                    static_cast<ck::long_index_t>(arg.in_left_pads_[2]);
-                                if(di >= 0 &&
-                                   ck::type_convert<std::size_t>(di) < arg.input_.GetLengths()[3] &&
-                                   hi >= 0 &&
-                                   ck::type_convert<std::size_t>(hi) < arg.input_.GetLengths()[4] &&
-                                   wi >= 0 &&
-                                   ck::type_convert<std::size_t>(wi) < arg.input_.GetLengths()[5])
+                                for(index_t c = 0; c < C; ++c)
                                {
-                                    for(index_t c = 0; c < C; ++c)
+                                    if(di >= 0 &&
+                                       ck::type_convert<std::size_t>(di) <
+                                           arg.input_.GetLengths()[3] &&
+                                       hi >= 0 &&
+                                       ck::type_convert<std::size_t>(hi) <
+                                           arg.input_.GetLengths()[4] &&
+                                       wi >= 0 &&
+                                       ck::type_convert<std::size_t>(wi) <
+                                           arg.input_.GetLengths()[5])
                                    {
                                        InDataType v_in = arg.input_(0, n, c, di, hi, wi);
                                        arg.output_(row, column) =
                                            ck::type_convert<OutDataType>(v_in);
-                                        column++;
                                    }
+                                    column++;
                                }
                            }
                        }

--- a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp
@@ -24,44 +24,94 @@ using S = ck::Sequence<Is...>;
 template <ck::index_t NDimSpatial, typename InLayout>
 using device_image_to_column_bf16_instances = std::tuple<
    // clang-format off
-        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|       Slice|    Thread| Scalar|
-        //#####################|        Dim|         |           |            |  Size| Block| Block|     Lengths|   Cluster|    Per|
-        //#####################|    Spatial|         |           |            |      |      |      |            |   Lengths| Vector|
-        //#####################|           |         |           |            |      |      |      |            |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<128, 128>, S<16, 16>,      8>
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,     8,     8,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    16,    16,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    16,    16,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,    64,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    16,    16, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     8>
    // clang-format on
    >;

 template <ck::index_t NDimSpatial, typename InLayout>
 using device_image_to_column_f16_instances = std::tuple<
    // clang-format off
-        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|       Slice|    Thread| Scalar|
-        //#####################|        Dim|         |           |            |  Size| Block| Block|     Lengths|   Cluster|    Per|
-        //#####################|    Spatial|         |           |            |      |      |      |            |   Lengths| Vector|
-        //#####################|           |         |           |            |      |      |      |            |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<128, 128>, S<16, 16>,      8>
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,     8,     8,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    16,    16,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    16,    16,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,    64,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    16,    16, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     8>
    // clang-format on
    >;

 template <ck::index_t NDimSpatial, typename InLayout>
 using device_image_to_column_f32_instances = std::tuple<
    // clang-format off
-        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|       Slice|    Thread| Scalar|
-        //#####################|        Dim|         |           |            |  Size| Block| Block|     Lengths|   Cluster|    Per|
-        //#####################|    Spatial|         |           |            |      |      |      |            |   Lengths| Vector|
-        //#####################|           |         |           |            |      |      |      |            |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<128, 128>, S<16, 16>,      4>
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,     8,     8,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,    16,    16,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    16,    16,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    64,    64,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    16,    16, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     4>
    // clang-format on
    >;

 template <ck::index_t NDimSpatial, typename InLayout>
 using device_image_to_column_i8_instances = std::tuple<
    // clang-format off
-        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|       Slice|    Thread| Scalar|
-        //#####################|        Dim|         |           |            |  Size| Block| Block|     Lengths|   Cluster|    Per|
-        //#####################|    Spatial|         |           |            |      |      |      |            |   Lengths| Vector|
-        //#####################|           |         |           |            |      |      |      |            |          |       |
-        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   256,   256, S<256, 256>, S<16, 16>,     16>
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
+        //#####################|           |         |           |            |      |      |      |          |       |
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,     8,     8,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    16,    16,   S<8, 8>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    32,    32,   S<8, 8>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,    64,    64,    64,   S<8, 8>,     8>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    16,    16,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,    64,  S<8, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    32,    64,  S<8, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   128,    64,   128,  S<8, 16>,     8>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    16,    16, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     1>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,    64,    64, S<16, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     4>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     8>,
+        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   256,   256, S<16, 16>,     16>
    // clang-format on
    >;


--- a/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt
+add_instance_library(device_image_to_column_instance
+   device_image_to_column_nhwc_1d_instance.cpp
+   device_image_to_column_nhwc_2d_instance.cpp
+   device_image_to_column_nhwc_3d_instance.cpp
+)
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -184,3 +184,42 @@ tflops: 95.337
 GB/s: 69.2301
 ```
 Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
+
+## Profile image to column kernels
+```bash
+# arg1: tensor operation (" OP_NAME ": " OP_DESC ")
+# arg2: data type (0: Input fp32, Weight fp32, Output fp32
+#                  1: Input fp16, Weight fp16, Output fp16
+#                  2: Input bf16, Weight bf16, Output bf16
+#                  3: Input int8, Weight int8, Output int8)
+# arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])
+# arg4: verification (0: no, 1: yes)
+# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
+# arg6: print tensor value (0: no; 1: yes)
+# arg7: time kernel (0: no, 1: yes)
+# Following arguments (depending on number of spatial dims):
+#  Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
+#  G, N, K, C, 
+#  <filter spatial dimensions>, (ie Y, X for 2D)
+#  <input image spatial dimensions>, (ie Hi, Wi for 2D)
+#  <strides>, (ie Sy, Sx for 2D)
+#  <dilations>, (ie Dy, Dx for 2D)
+#  <left padding>, (ie LeftPy, LeftPx for 2D)
+#  <right padding>, (ie RightPy, RightPx for 2D)
+
+ ################             op   datatype  layout  verify  init  log  time  Ndims  G   N   K   C  Y  X  Hi  Wi  Sy  Sx  Dy  Dx  LeftPy  LeftPx  RightPy  RightPx
+./bin/ckProfiler image_to_column          0       0       1     1    0     1      2  1 256   1 512  3  3   28  28   1   1   1   1        0       0       0        0
+
+ ```
+
+Result (MI250, FP32, NHWC)
+```
+input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
+output: dim 2, lengths {173056, 4608}, strides {4608, 1}
+....
+Best configuration parameters:
+name: DeviceImageToColumn<256, 64, 64, 4>
+avg_time: 3.19792
+tflops: 0
+GB/s: 1125.99
+```
--- a/profiler/include/profiler/profile_image_to_column_impl.hpp
+++ b/profiler/include/profiler/profile_image_to_column_impl.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iomanip>
+#include <iostream>
+#include <typeinfo>
+#include <limits>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp"
+
+namespace ck {
+namespace profiler {
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+template <index_t NDimSpatial,
+          typename InputLayout,
+          typename InputDataType,
+          typename OutputDataType>
+bool profile_image_to_column_impl(int do_verification,
+                                  int init_method,
+                                  bool do_log,
+                                  bool time_kernel,
+                                  const ck::utils::conv::ConvParam& conv_param)
+{
+    const ck::index_t NDoHoWo =
+        conv_param.N_ *
+        ck::accumulate_n<ck::index_t>(
+            conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+    const ck::index_t CZYX =
+        conv_param.C_ *
+        ck::accumulate_n<ck::index_t>(
+            conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+
+    const auto in_desc =
+        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InputLayout>(
+            conv_param);
+    const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+
+    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
+    std::array<ck::index_t, 2> output_m_k_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+    std::array<ck::index_t, NDimSpatial> input_left_pads{};
+    std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+    auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+
+    copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
+    copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
+    copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
+    copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
+    copy(out_desc.GetStrides(), output_m_k_strides);
+    copy(conv_param.conv_filter_strides_, conv_filter_strides);
+    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+    copy(conv_param.input_left_pads_, input_left_pads);
+    copy(conv_param.input_right_pads_, input_right_pads);
+
+    Tensor<InputDataType> input(in_desc);
+    Tensor<OutputDataType> host_output(out_desc);
+    Tensor<OutputDataType> device_output(out_desc);
+
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "output: " << host_output.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1: input.GenerateTensorValue(GeneratorTensor_2<InputDataType>{-5, 5}); break;
+    default: input.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
+    }
+
+    DeviceMem in_device_buf(sizeof(InputDataType) * input.mDesc.GetElementSpaceSize());
+    DeviceMem out_device_buf(sizeof(OutputDataType) * device_output.mDesc.GetElementSpaceSize());
+
+    in_device_buf.ToDevice(input.mData.data());
+
+    // run reference op
+    if(do_verification)
+    {
+        auto ref_image_to_column = ck::tensor_operation::host::
+            ReferenceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>{};
+
+        auto ref_invoker  = ref_image_to_column.MakeInvoker();
+        auto ref_argument = ref_image_to_column.MakeArgument(input,
+                                                             host_output,
+                                                             conv_param.filter_spatial_lengths_,
+                                                             conv_param.conv_filter_strides_,
+                                                             conv_param.conv_filter_dilations_,
+                                                             conv_param.input_left_pads_,
+                                                             conv_param.input_right_pads_);
+
+        // init host output to zero
+        host_output.SetZero();
+
+        ref_invoker.Run(ref_argument);
+    }
+
+    using DeviceOp = ck::tensor_operation::device::
+        DeviceImageToColumn<NDimSpatial, InputLayout, InputDataType, OutputDataType>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    std::string best_op_name;
+    float best_avg_time = std::numeric_limits<float>::max();
+    ;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device op instances
+    bool pass = true;
+
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<InputDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<OutputDataType*>(out_device_buf.GetDeviceBuffer()),
+            conv_param.N_,
+            conv_param.C_,
+            input_spatial_lengths,
+            filter_spatial_lengths,
+            output_spatial_lengths,
+            input_g_n_c_wis_strides,
+            output_m_k_strides,
+            conv_filter_strides,
+            conv_filter_dilations,
+            input_left_pads,
+            input_right_pads);
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            // re-init output to zero before profiling next kernel
+            out_device_buf.SetZero();
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+            float avg_time =
+                invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+            std::size_t num_btype =
+                conv_param.GetInputByte<InputDataType>() + NDoHoWo * CZYX * sizeof(OutputDataType);
+
+            float gb_per_sec = num_btype / 1.E6 / avg_time;
+
+            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+
+            if(avg_time < best_avg_time)
+            {
+                best_op_name    = op_name;
+                best_avg_time   = avg_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+
+            if(do_verification)
+            {
+                out_device_buf.FromDevice(device_output.mData.data());
+
+                pass = pass & ck::utils::check_err(device_output, host_output);
+
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "host_output  : ", host_output.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best configuration parameters:"
+              << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
+              << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
+
+    return pass;
+}
+
+} // namespace profiler
+} // namespace ck
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -25,6 +25,7 @@ set(PROFILER_SOURCES
    profile_contraction_bilinear.cpp
    profile_contraction_scale.cpp
    profile_grouped_conv_bwd_data.cpp
+    profile_image_to_column.cpp
 )
 if(DL_KERNELS)
  list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
@@ -76,6 +77,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_in
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
 if(DL_KERNELS)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 endif()

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -60,6 +60,7 @@ add_subdirectory(contraction)
 add_subdirectory(pool_fwd)
 add_subdirectory(batched_gemm_multi_d)
 add_subdirectory(grouped_convnd_bwd_data)
+add_subdirectory(image_to_column)
 if(GPU_TARGETS MATCHES "gfx11")
    add_subdirectory(wmma_op)
 endif()
--- a/test/image_to_column/CMakeLists.txt
+++ b/test/image_to_column/CMakeLists.txt
+add_gtest_executable(test_image_to_column test_image_to_column.cpp)
+target_link_libraries(test_image_to_column PRIVATE utility device_image_to_column_instance)
+add_gtest_executable(test_image_to_column_interface test_image_to_column_interface.cpp)
+target_link_libraries(test_image_to_column_interface PRIVATE utility)
--- a/test/image_to_column/test_image_to_column.cpp
+++ b/test/image_to_column/test_image_to_column.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "profiler/profile_image_to_column_impl.hpp"
+
+template <typename Tuple>
+class TestImageToColumn : public ::testing::Test
+{
+    protected:
+    using InDataType  = std::tuple_element_t<0, Tuple>;
+    using OutDataType = std::tuple_element_t<1, Tuple>;
+    using InLayout    = std::tuple_element_t<2, Tuple>;
+
+    std::vector<ck::utils::conv::ConvParam> conv_params;
+
+    template <ck::index_t NDimSpatial>
+    void Run()
+    {
+        EXPECT_FALSE(conv_params.empty());
+        bool pass = true;
+        for(auto& param : conv_params)
+        {
+            pass = pass && ck::profiler::profile_image_to_column_impl<NDimSpatial,
+                                                                      InLayout,
+                                                                      InDataType,
+                                                                      OutDataType>(
+                               true,  // do_verification
+                               1,     // init_method: integer value
+                               false, // do_log
+                               false, // time_kernel
+                               param);
+        }
+        EXPECT_TRUE(pass);
+    }
+};
+
+using namespace ck::tensor_layout::convolution;
+
+using KernelTypes1d = ::testing::Types<std::tuple<float, float, GNWC>,
+                                       std::tuple<ck::bhalf_t, ck::bhalf_t, GNWC>,
+                                       std::tuple<ck::half_t, ck::half_t, GNWC>,
+                                       std::tuple<int8_t, int8_t, GNWC>>;
+
+using KernelTypes2d = ::testing::Types<std::tuple<float, float, GNHWC>,
+                                       std::tuple<ck::bhalf_t, ck::bhalf_t, GNHWC>,
+                                       std::tuple<ck::half_t, ck::half_t, GNHWC>,
+                                       std::tuple<int8_t, int8_t, GNHWC>>;
+
+using KernelTypes3d = ::testing::Types<std::tuple<float, float, GNDHWC>,
+                                       std::tuple<ck::bhalf_t, ck::bhalf_t, GNDHWC>,
+                                       std::tuple<ck::half_t, ck::half_t, GNDHWC>,
+                                       std::tuple<int8_t, int8_t, GNDHWC>>;
+
+template <typename Tuple>
+class TestImageToColumn1d : public TestImageToColumn<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestImageToColumn2d : public TestImageToColumn<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestImageToColumn3d : public TestImageToColumn<Tuple>
+{
+};
+
+TYPED_TEST_SUITE(TestImageToColumn1d, KernelTypes1d);
+TYPED_TEST_SUITE(TestImageToColumn2d, KernelTypes2d);
+TYPED_TEST_SUITE(TestImageToColumn3d, KernelTypes3d);
+
+TYPED_TEST(TestImageToColumn1d, Test1D)
+{
+    this->conv_params.clear();
+
+    this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {2}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
+    // ScalarPerVector should be 1
+    this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
+    // stride != 1
+    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
+    // dilation != 1
+    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
+    this->template Run<1>();
+}
+
+TYPED_TEST(TestImageToColumn2d, Test2D)
+{
+    this->conv_params.clear();
+
+    this->conv_params.push_back(
+        {2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back(
+        {2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+    this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->template Run<2>();
+}
+
+TYPED_TEST(TestImageToColumn3d, Test3D)
+{
+    this->conv_params.clear();
+    this->conv_params.push_back(
+        {3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back(
+        {3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back(
+        {3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->template Run<3>();
+}
--- a/test/image_to_column/test_image_to_column_interface.cpp
+++ b/test/image_to_column/test_image_to_column_interface.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp"
+
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using DataType = float;
+using InLayout = ck::tensor_layout::convolution::GNWC;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+template <ck::index_t ScalarPerVector, bool IsCPacked>
+class TestImageToColumnInterface : public ::testing::Test
+{
+    protected:
+    static constexpr ck::index_t NDimSpatial = 1;
+
+    // clang-format off
+    using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl
+        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread|         Scalar|
+        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|            Per|
+        //#####################|    Spatial|         |           |            |      |      |      |   Lengths|         Vector|
+        //#####################|           |         |           |            |      |      |      |          |               |
+                              < NDimSpatial, InLayout,   DataType,    DataType,   256,   128,   128, S<16, 16>,ScalarPerVector>;
+    // clang-format on
+
+    ck::utils::conv::ConvParam conv_param;
+
+    bool Run()
+    {
+
+        const auto N = conv_param.N_;
+        const auto C = conv_param.C_;
+        const auto FakeC =
+            conv_param.C_ / 2; // Fake C to simulate the behavior that C is not packed
+
+        const ck::index_t NDoHoWo =
+            N *
+            ck::accumulate_n<ck::index_t>(
+                conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+        const ck::index_t CZYX =
+            C *
+            ck::accumulate_n<ck::index_t>(
+                conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+
+        const auto in_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+        const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+
+        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
+        std::array<ck::index_t, 2> output_m_k_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+        std::array<ck::index_t, NDimSpatial> input_left_pads{};
+        std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+        auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+
+        copy(conv_param.input_spatial_lengths_, input_spatial_lengths);
+        copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
+        copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
+        copy(in_desc.GetStrides(), input_g_n_c_wis_strides);
+        copy(out_desc.GetStrides(), output_m_k_strides);
+        copy(conv_param.conv_filter_strides_, conv_filter_strides);
+        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
+        copy(conv_param.input_left_pads_, input_left_pads);
+        copy(conv_param.input_right_pads_, input_right_pads);
+
+        auto img2col  = DeviceImgToColInstance{};
+        auto argument = img2col.MakeArgument(nullptr,
+                                             nullptr,
+                                             N,
+                                             IsCPacked ? C : FakeC,
+                                             input_spatial_lengths,
+                                             filter_spatial_lengths,
+                                             output_spatial_lengths,
+                                             input_g_n_c_wis_strides,
+                                             output_m_k_strides,
+                                             conv_filter_strides,
+                                             conv_filter_dilations,
+                                             input_left_pads,
+                                             input_right_pads);
+
+        return img2col.IsSupportedArgument(argument);
+    }
+};
+
+class TestImageToColumnInterface1ScalarPerVector : public TestImageToColumnInterface<1, true>
+{
+};
+
+class TestImageToColumnInterface4ScalarPerVector : public TestImageToColumnInterface<4, true>
+{
+};
+
+class TestImageToColumnInterface4ScalarPerVectorFakeC : public TestImageToColumnInterface<4, false>
+{
+};
+
+TEST_F(TestImageToColumnInterface1ScalarPerVector, X1ScalarPerVector)
+{
+    // vector load C * X % ScalarPerVector
+    this->conv_param  = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
+    bool is_supported = this->Run();
+    EXPECT_TRUE(is_supported);
+    // vector load C * left_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+    // vector load C * right_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+
+    // vector load C % ScalarPerVector, right_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+    // vector load C % ScalarPerVector, left_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+    // vector load C % ScalarPerVector, dilation
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+
+    // C = 4
+    this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+}
+
+TEST_F(TestImageToColumnInterface4ScalarPerVector, X4ScalarPerVector)
+{
+    // vector load C * X % ScalarPerVector
+    this->conv_param  = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}};
+    bool is_supported = this->Run();
+    EXPECT_FALSE(is_supported);
+    // vector load C * left_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}};
+    is_supported     = this->Run();
+    EXPECT_FALSE(is_supported);
+    // vector load C * right_pad_x % ScalarPerVector
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}};
+    is_supported     = this->Run();
+    EXPECT_FALSE(is_supported);
+
+    // vector load C % ScalarPerVector, right_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}};
+    is_supported     = this->Run();
+    EXPECT_FALSE(is_supported);
+    // vector load C % ScalarPerVector, left_pad and stride
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}};
+    is_supported     = this->Run();
+    EXPECT_FALSE(is_supported);
+    // vector load C % ScalarPerVector, dilation
+    this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}};
+    is_supported     = this->Run();
+    EXPECT_FALSE(is_supported);
+
+    // C = 4
+    this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+}
+
+TEST_F(TestImageToColumnInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC)
+{
+    // C = 3
+    this->conv_param  = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}};
+    bool is_supported = this->Run();
+    EXPECT_FALSE(is_supported);
+    // C = 4
+    this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}};
+    is_supported     = this->Run();
+    EXPECT_TRUE(is_supported);
+}