Commit 8ce9fe57 authored by carlushuang

remove useless comments, add several new configs for multi-threading

parent b8ba0239
......@@ -213,9 +213,6 @@ struct BlockwiseGemmAvx2_MxN
auto current_mr = ck::math::min(m_per_block - i_m, m_per_thread);
param.p_a = &a_block_buf.p_data_[GetABlockStartOffset(a_block_desc, i_m, 0)];
// printf("YYYY: %d, i_m:%d, current_mr:%d, %d, %p\n",__LINE__, i_m, current_mr,
// GetABlockStartOffset(a_block_desc, i_m, 0), param.p_a);fflush(stdout);
for(ck::index_t i_n = 0; i_n < n_per_block; i_n += n_per_thread)
{
auto current_nr = ck::math::min(n_per_block - i_n, n_per_thread);
......@@ -223,11 +220,6 @@ struct BlockwiseGemmAvx2_MxN
param.p_b = &b_block_buf.p_data_[GetBBlockStartOffset(b_block_desc, 0, i_n)];
param.p_c = &c_buf.p_data_[GetCBlockStartOffset(c_desc, i_m, i_n)];
// printf("YYYY: %d, i_n:%d, current_nr:%d, %d, %p, C:%d, %p\n",__LINE__, i_n,
// current_nr, GetBBlockStartOffset(b_block_desc, 0, i_n), param.p_b,
// GetCBlockStartOffset(c_desc, i_m, i_n),
// param.p_c);fflush(stdout);
ThreadwiseGemm_Dispatch::Run(&param, current_mr, current_nr);
}
}
......
......@@ -12,6 +12,7 @@
#include <utility>
#include <unistd.h>
#include <omp.h>
#include <pthread.h>
namespace ck {
namespace cpu {
......@@ -193,6 +194,23 @@ struct GridwiseGemmAvx2_MxN
int total_threads = omp_get_max_threads();
#if 0
if(total_threads > 1){
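// pin OpenMP worker i to logical CPU i; sched_setaffinity(0, ...) affects the calling thread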
#pragma omp parallel
{
int tid = omp_get_thread_num();
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(tid, &set);
if (sched_setaffinity(0, sizeof(set), &set) == -1) {
throw std::runtime_error("wrong! fail to set thread affinity");
}
}
}
#endif
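// An equivalent sketch using pthread_setaffinity_np (made available by the
// <pthread.h> include added above); also left disabled here, Linux/glibc assumed.
#if 0
if(total_threads > 1){
#pragma omp parallel
{
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(omp_get_thread_num(), &set);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
}
}
#endif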
// TODO: OpenMP-aware ordering
if constexpr(std::is_same<BlockMNKAccessOrder, ck::Sequence<0, 1, 2>>::value)
......@@ -234,8 +252,9 @@ struct GridwiseGemmAvx2_MxN
MemAlignmentByte);
DeviceAlignedMemCPU b_block_mem(k_per_block * n_per_block * sizeof(FloatB),
MemAlignmentByte);
DeviceAlignedMemCPU c_block_mem(
UseCLocalBuffer ? (m_per_block * n_per_block * sizeof(FloatC)) : 0,
MemAlignmentByte);
auto a_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
reinterpret_cast<FloatA*>(a_block_mem.mpDeviceBuf),
......@@ -298,26 +317,9 @@ struct GridwiseGemmAvx2_MxN
auto a_block_desc = GetABlockDescriptor(mc_size, kc_size);
auto b_block_desc = GetBBlockDescriptor(kc_size, nc_size);
// printf("[tid:%d]==> i_m:%d, i_n:%d, i_k:%d, mc:%d, nc:%d, kc:%d(%d,
// %d)\n", tid, i_mc,
// i_nc, i_kc, mc_size, nc_size, kc_size, KPerBlock, GemmK); fflush(stdout);
a_threadwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
b_threadwise_copy.Run(b_grid_desc, b_grid_buf, b_block_desc, b_block_buf);
blockwise_gemm.Run(a_block_desc,
a_block_buf,
make_zero_multi_index<a_block_copy_dim>(),
......@@ -329,28 +331,13 @@ struct GridwiseGemmAvx2_MxN
make_zero_multi_index<2>(),
i_kc != 0);
// printf("[%d] 2222 \n",__LINE__);
if((i_kc + k_per_block) < GemmK)
{
a_threadwise_copy.MoveSrcSliceWindow(a_grid_desc, a_move_k_step);
b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc, b_move_k_step);
}
// printf("[%d] 2222 \n",__LINE__);
// for(auto i_elem = 0; i_elem < (10) ; i_elem++){
// printf("C ==> %3d : %f(0x%08x)\n", i_elem,
// (reinterpret_cast<float*>(c_block_buf.p_data_))[i_elem],
// (reinterpret_cast<uint32_t*>(c_block_buf.p_data_))[i_elem]);
// }
}
if constexpr(UseCLocalBuffer)
c_threadwise_copy.Run(c_block_desc, c_block_buf, c_grid_desc, c_grid_buf);
}
......@@ -396,8 +383,9 @@ struct GridwiseGemmAvx2_MxN
MemAlignmentByte);
DeviceAlignedMemCPU b_block_mem(k_per_block * n_per_block * sizeof(FloatB),
MemAlignmentByte);
DeviceAlignedMemCPU c_block_mem(
UseCLocalBuffer ? (m_per_block * n_per_block * sizeof(FloatC)) : 0,
MemAlignmentByte);
auto a_block_buf = ck::cpu::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(
reinterpret_cast<FloatA*>(a_block_mem.mpDeviceBuf),
......
......@@ -349,9 +349,6 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
src_offset = i_n * Hi * Wi * C + i_hi * Wi * C + i_wi * C + i_c;
i_gemm_k = idx_k;
// printf("[%d] i_wo:%d, i_ho:%d, i_wi:%d, i_hi:%d, src_offset:%d\n",
// __LINE__, i_wo, i_ho, i_wi, i_hi, src_offset);
}
}
......@@ -447,7 +444,6 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
if(i_ho_itr >= Ho)
{
i_ho_itr = 0;
p_src += input_offset_ovf_hi_acc_n;
}
......@@ -468,26 +464,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
ck::index_t i_wi_itr = i_wi;
ck::index_t i_hi_itr = i_hi;
// printf("[%d] i_m_itr:%d, i_wo_itr:%d, i_ho_itr:%d, i_wi_itr:%d, i_hi_itr:%d,
// src_offset:%d, input_offset_acc_wi:%d,
// input_offset_ovf_wi_acc_hi:%d,input_offset_ovf_hi_acc_n:%d, %p(%p)\n",
// __LINE__, i_m_itr, i_wo_itr, i_ho_itr, i_wi_itr, i_hi_itr,
// src_offset, input_offset_acc_wi, input_offset_ovf_wi_acc_hi,
// input_offset_ovf_hi_acc_n, src_buf.p_data_, p_src);
// printf("%p %p %p, %d, %x, %p\n",src_buf.p_data_, reinterpret_cast<const
// float*>(src_buf.p_data_) + 1, reinterpret_cast<const float*>(src_buf.p_data_)
// + ck::index_t(-1),
// sizeof(src_offset), *reinterpret_cast<uint32_t*>(&src_offset),
// reinterpret_cast<const float*>(src_buf.p_data_) + (-1088));
while(i_m_itr > 0)
{
// printf("[%d] i_m_itr:%d, i_wo_itr:%d, i_ho_itr:%d, i_wi_itr:%d,
// i_hi_itr:%d, src_offset:%d -> %p\n",
// __LINE__, i_m_itr, i_wo_itr, i_ho_itr, i_wi_itr, i_hi_itr, src_offset,
// p_src);
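// note: reinterpreting the signed indices as uint32_t folds the two range checks
// (0 <= i && i < bound) into one unsigned compare, so padding pixels are skipped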
if((*reinterpret_cast<uint32_t*>(&i_hi_itr) < Hi) &&
(*reinterpret_cast<uint32_t*>(&i_wi_itr) < Wi))
avx2_util::memcpy32_avx2(p_dst, p_src, k_per_block);
......@@ -512,14 +490,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
{
i_ho_itr = 0;
i_hi_itr -= Ho * Sy;
p_src += input_offset_ovf_hi_acc_n;
}
i_m_itr--;
}
// printf("[%d] \n", __LINE__);
}
else
{
......@@ -538,8 +513,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
ck::index_t i_wi_itr_k = i_wi_itr;
ck::index_t i_hi_itr_k = i_hi_itr;
ck::index_t i_c_itr_k = i_c;
// ck::index_t i_y_itr_k = i_y;
ck::index_t i_x_itr_k = i_x;
ck::index_t i_k_itr = k_per_block;
while(i_k_itr > 0)
......@@ -566,7 +541,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
if(i_x_itr_k >= Fx)
{
i_x_itr_k = 0;
// i_y_itr_k++;
i_wi_itr_k -= Dx * Fx;
i_hi_itr_k += Dy;
p_src_k += input_offset_ovf_x_acc_y;
......@@ -594,7 +569,6 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
{
i_ho_itr = 0;
i_hi_itr -= Ho * Sy;
p_src += input_offset_ovf_hi_acc_n;
}
......@@ -626,40 +600,27 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
if constexpr(GemmKSpecialization ==
ConvolutionForwardGemmKSpecialization_t::NHWC_GemmKLoopOverC)
{
// C % k_per_block == 0, so every step here advances by the same full k_per_block
// ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h
// iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w
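// e.g. stride_h=2, dilation_h=1, pad_h=1: iho=3, iy=0 -> ihi = 3*2 + 0*1 - 1 = 5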
// printf("222222 C:%d, src_offset:%d, i_c:%d, i_x:%d\n", C, src_offset, i_c, i_x);
// fflush(stdout);
// TODO: branch seems weird
i_c += move_k;
src_offset += move_k;
// printf("3333[%d] src_offset:%d\n", __LINE__, src_offset);
if(i_c >= C)
{
i_c = 0;
i_x++;
i_wi += Dx;
src_offset += Dx * C - C;
// printf("3333[%d] src_offset:%d\n", __LINE__, src_offset);
}
if(i_x >= Fx)
{
i_x = 0;
// i_y++;
i_wi = i_wi - Fx * Dx;
i_hi += Dy;
src_offset += Dy * Wi * C - Fx * Dx * C;
// printf("3333[%d] src_offset:%d\n", __LINE__, src_offset);
}
// printf("inp move:%d, i_c:%d, i_hi:%d, i_wi:%d src_offset:%d\n", move_k, i_c,
// i_hi, i_wi, src_offset); fflush(stdout);
}
else
{
......
......@@ -28,17 +28,24 @@ DeviceMem::~DeviceMem() { hipGetErrorString(hipFree(mpDeviceBuf)); }
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
: mMemSize(mem_size), mAlignment(alignment)
{
if(mem_size == 0)
{
mpDeviceBuf = nullptr;
}
else
{
assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
void* p1;
void** p2;
int offset = alignment - 1 + sizeof(void*);
p1 = malloc(mem_size + offset);
assert(p1 != nullptr);
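// round p1 up to the next 'alignment' boundary, reserving sizeof(void*) bytes
// in front to stash the original malloc() pointer for the destructor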
p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
p2[-1] = p1;
mpDeviceBuf = reinterpret_cast<void*>(p2);
}
}
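// Usage sketch (alignment must be a power of two; 32 matches AVX2's 256-bit width):
//   DeviceAlignedMemCPU buf(n * sizeof(float), 32);
//   float* p = static_cast<float*>(buf.GetDeviceBuffer()); // 32-byte aligned; nullptr when n == 0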
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }
......@@ -51,7 +58,11 @@ void DeviceAlignedMemCPU::FromDevice(void* p) { memcpy(p, mpDeviceBuf, mMemSize)
void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
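// the original malloc() pointer was stashed one slot before the aligned pointer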
if(mpDeviceBuf != nullptr)
free((reinterpret_cast<void**>(mpDeviceBuf))[-1]);
}
struct KernelTimerImpl
{
......
......@@ -55,30 +55,81 @@ static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, GemmKLoopOverC , LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, DefaultGemmKLoop, LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1S1P0, DefaultGemmKLoop, LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>
// clang-format on
using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
// clang-format off
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 64, 6, 16, true, true, false),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 128, 6, 16, true, true, false),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 128, 256, 128, 6, 16, true, true, false),
// DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 144, 128, 4, 24, true, true, false),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 240, 128, 4, 24, true, true, false),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 256, 128, 6, 16, true, true, false),
// DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 288, 128, 4, 24, true, true, false),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 320, 128, 6, 16, true, true, false),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 896, 352, 128, 6, 16, true, true, false),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 1024, 416, 128, 6, 16, true, true, false)>;
// clang-format on
// use these in a single-threaded environment, when gemm_n is not a multiple of 8
using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_local_c_instances = std::tuple<
// clang-format off
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 64, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 128, 256, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 240, 128, 4, 24, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 256, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 320, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 896, 352, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 1024, 416, 128, 6, 16, true, true, true)>;
// clang-format on
// use these in a multi-threaded environment (a thread-local C buffer avoids cache-coherence
// traffic between cores, although sometimes skipping the local C buffer is faster...)
using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_mt_instances = std::tuple<
// clang-format off
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 48, 24, 128, 4, 24, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 72, 16, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 72, 32, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 96, 32, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 96, 64, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 120, 32, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 120, 64, 128, 6, 16, true, true, true),
// DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 64, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 128, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 128, 256, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 240, 128, 4, 24, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 256, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 320, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 896, 352, 128, 6, 16, true, true, true),
DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 1024, 416, 128, 6, 16, true, true, true)>;
// clang-format on
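// Why the multi-thread instances keep the local-C flag set to true: threads that
// accumulate into adjacent tiles of one shared C buffer can bounce the same cache
// line between cores. An illustrative sketch (not part of this TU; tiles,
// tile_elems, partial and c_shared are hypothetical names):
#if 0
#pragma omp parallel for
for(int t = 0; t < tiles; t++)
{
    std::vector<float> c_local(tile_elems, 0.0f); // thread-private tile: no line bouncing
    for(int i = 0; i < tile_elems; i++)
        c_local[i] += partial(t, i);              // stand-in for the GEMM inner loop
    std::memcpy(c_shared + t * tile_elems, c_local.data(),
                tile_elems * sizeof(float));      // single write-back per tile
}
#endif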
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances{});
}
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_local_c_instances{});
}
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
ck::tensor_operation::device::add_device_operation_instances(
instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_mt_instances{});
}
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
......
#pragma once
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "reference_conv_fwd.hpp"
namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
#define AVX2_DATA_ALIGNMENT 32 // assumed value: 32-byte alignment for 256-bit AVX2 vectors
template <int NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
void profile_conv_cpu_fwd_impl(int do_verification,
int init_method,
bool do_log,
int nrepeat,
ck::index_t N,
ck::index_t K,
ck::index_t C,
std::vector<ck::index_t> input_spatial_lengths,
std::vector<ck::index_t> filter_spatial_lengths,
std::vector<ck::index_t> output_spatial_lengths,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
std::vector<ck::index_t> input_right_pads)
{
const ck::index_t Y = filter_spatial_lengths[0];
const ck::index_t X = filter_spatial_lengths[1];
const ck::index_t Hi = input_spatial_lengths[0];
const ck::index_t Wi = input_spatial_lengths[1];
const ck::index_t Ho = output_spatial_lengths[0];
const ck::index_t Wo = output_spatial_lengths[1];
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
}
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
}
};
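// note: both branches keep logical dims {N, C, H, W} and express the layout via
// strides alone; for NHWC, element (n, c, h, w) sits at n*C*H*W + h*W*C + w*C + c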
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_host_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_k_ho_wo_device_result(
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
using InElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
if(do_verification)
{
using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
auto ref_conv = ReferenceConvFwdInstance{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
wei_k_c_y_x,
out_n_k_ho_wo_host_result,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
}
DeviceAlignedMemCPU in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
DeviceAlignedMemCPU out_device_buf(sizeof(OutDataType) *
out_n_k_ho_wo_device_result.mDesc.GetElementSpace(),
AVX2_DATA_ALIGNMENT);
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
memcpy(in_device_buf.mpDeviceBuf, in_n_c_hi_wi.mData.data(), in_device_buf.mMemSize);
memcpy(wei_device_buf.mpDeviceBuf, wei_k_c_y_x.mData.data(), wei_device_buf.mMemSize);
using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
using DeviceConvFwdNoOpPtr =
ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
// add device Conv instances
std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
if(conv_ptrs.empty())
{
throw std::runtime_error("wrong! no device Conv instance found");
}
std::string best_conv_name;
float best_ave_time = 0;
float best_gflops = 0;
float best_gb_per_sec = 0;
// profile device Conv instances
for(auto& conv_ptr : conv_ptrs)
{
auto argument_ptr = conv_ptr->MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
auto invoker_ptr = conv_ptr->MakeInvokerPointer();
if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::string conv_name = conv_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) +
sizeof(WeiDataType) * (K * C * Y * X) +
sizeof(OutDataType) * (N * K * Ho * Wo);
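// ave_time is in ms, so flop / 1e6 / ave_time yields GFLOP/s and
// num_btype / 1e6 / ave_time yields GB/s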
float gflops = static_cast<float>(flop) / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << gflops << " GFlops, " << gb_per_sec
<< " GB/s, " << conv_name << std::endl;
if(gflops > best_gflops)
{
best_conv_name = conv_name;
best_gflops = gflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
memcpy(out_n_k_ho_wo_device_result.mData.data(),
out_device_buf.mpDeviceBuf,
out_device_buf.mMemSize);
check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
<< std::endl;
}
}
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gflops << " GFlops, "
<< best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
}
} // namespace profiler
} // namespace ck
......@@ -26,6 +26,12 @@ using PassThrough = ck::tensor_operation::cpu::element_wise::PassThrough;
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(
std::vector<DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
......@@ -300,8 +306,22 @@ int main(int argc, char* argv[])
ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
{
if(omp_get_max_threads() > 1)
{
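// multi-threaded run: try the small-tile MT configs (thread-private C buffer)
// alongside the default configs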
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_mt(conv_ptrs);
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
}
else
{
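// single-threaded run: the shared-C configs need gemm_n (= K) to be a multiple
// of 8, otherwise fall back to the local-C instances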
if(K % 8 == 0)
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(conv_ptrs);
else
ck::tensor_operation::cpu::device::device_conv2d_fwd_avx2_instance::
add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_local_c(conv_ptrs);
}
}
if(conv_ptrs.empty())
......