Merge remote-tracking branch 'origin/develop' into cpu_avx2

e72c0c43 · carlushuang · d714fa15 · 313bbea5 · e72c0c43 · e72c0c43
Commit e72c0c43 authored Mar 26, 2022 by carlushuang
20 changed files
--- a/profiler/src/profile_gemm_bias_relu_add.cpp
+++ b/profiler/src/profile_gemm_bias_relu_add.cpp
@@ -6,7 +6,7 @@
 #include <half.hpp>
 #include "profile_gemm_bias_relu_add_impl.hpp"

-enum GemmMatrixLayout
+enum struct GemmMatrixLayout
 {
    MK_KN_MN, // 0
    MK_NK_MN, // 1
@@ -18,7 +18,7 @@ enum GemmMatrixLayout
    KM_NK_NM, // 7
 };

-enum GemmDataType
+enum struct GemmDataType
 {
    F32_F32_F32, // 0
    F16_F16_F16, // 1
@@ -43,8 +43,8 @@ int profile_gemm_bias_relu_add(int argc, char* argv[])
        exit(1);
    }

-    const int data_type        = static_cast<GemmDataType>(std::stoi(argv[2]));
-    const int layout           = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);

--- a/profiler/src/profile_gemm_reduce.cpp
+++ b/profiler/src/profile_gemm_reduce.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "profile_gemm_reduce_impl.hpp"
+
+/// Command-line entry point for the "gemm_reduce" profiler operation.
+///
+/// argv[2]..argv[13] are read, in order, as: data type, matrix layout, verification
+/// flag, initialization method, tensor-logging flag, repeat count, M, N, K, StrideA,
+/// StrideB, StrideC. A negative stride is replaced by K, M or N, depending on the
+/// selected layout. When argc is neither 14 nor 15 the usage text is printed and the
+/// process exits with status 1.
+///
+/// Only the fp16 data type is dispatched here; every other data type / layout
+/// combination throws std::runtime_error.
+///
+/// @param argc number of command-line arguments
+/// @param argv command-line arguments; argv[1] names the operation
+/// @return 1 once the selected profiling routine has returned
+int profile_gemm_reduce(int argc, char* argv[])
+{
+    enum struct GemmMatrixLayout_t
+    {
+        MK_KN_MN, // 0
+        MK_NK_MN, // 1
+        KM_KN_MN, // 2
+        KM_NK_MN, // 3
+    };
+
+    enum struct GemmReduceDataType_t
+    {
+        F32_F32_F32_F32_F32, // 0
+        F16_F16_F16_F32_F32, // 1
+    };
+
+    // NOTE(review): argc == 15 is accepted, but argv[14] is not read anywhere in this
+    // function.
+    if(!(argc == 14 || argc == 15))
+    {
+        printf("arg1: tensor operation (gemm: GEMM+Reduce)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
+        printf("arg14: split k into multiple batch\n");
+        exit(1);
+    }
+
+    const auto data_type       = static_cast<GemmReduceDataType_t>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout_t>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const int nrepeat          = std::stoi(argv[7]);
+
+    const int M = std::stoi(argv[8]);
+    const int N = std::stoi(argv[9]);
+    const int K = std::stoi(argv[10]);
+
+    const int StrideA = std::stoi(argv[11]);
+    const int StrideB = std::stoi(argv[12]);
+    const int StrideC = std::stoi(argv[13]);
+
+    // Each branch binds one layout to the matching RowMajor/ColumnMajor tags and
+    // substitutes the layout's default for any negative stride.
+    if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
+       layout == GemmMatrixLayout_t::MK_KN_MN)
+    {
+        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
+                                               ck::half_t,
+                                               ck::half_t,
+                                               float,
+                                               ck::tensor_layout::gemm::RowMajor,
+                                               ck::tensor_layout::gemm::RowMajor,
+                                               ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? K : StrideA,
+            (StrideB < 0) ? N : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
+            layout == GemmMatrixLayout_t::MK_NK_MN)
+    {
+        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
+                                               ck::half_t,
+                                               ck::half_t,
+                                               float,
+                                               ck::tensor_layout::gemm::RowMajor,
+                                               ck::tensor_layout::gemm::ColumnMajor,
+                                               ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? K : StrideA,
+            (StrideB < 0) ? K : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
+            layout == GemmMatrixLayout_t::KM_KN_MN)
+    {
+        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
+                                               ck::half_t,
+                                               ck::half_t,
+                                               float,
+                                               ck::tensor_layout::gemm::ColumnMajor,
+                                               ck::tensor_layout::gemm::RowMajor,
+                                               ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? M : StrideA,
+            (StrideB < 0) ? N : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
+            layout == GemmMatrixLayout_t::KM_NK_MN)
+    {
+        ck::profiler::profile_gemm_reduce_impl<ck::half_t,
+                                               ck::half_t,
+                                               ck::half_t,
+                                               float,
+                                               ck::tensor_layout::gemm::ColumnMajor,
+                                               ck::tensor_layout::gemm::ColumnMajor,
+                                               ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            M,
+            N,
+            K,
+            (StrideA < 0) ? M : StrideA,
+            (StrideB < 0) ? K : StrideB,
+            (StrideC < 0) ? N : StrideC);
+    }
+    else
+    {
+        throw std::runtime_error("wrong! this data_type & layout is not implemented");
+    }
+
+    return 1;
+}
--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "profile_grouped_gemm_impl.hpp"
+
+// Operand layout selector parsed from argv[3] by profile_grouped_gemm.
+// Per that function's usage text: 0 = A[m,k]*B[k,n], 1 = A[m,k]*B[n,k],
+// 2 = A[k,m]*B[k,n], 3 = A[k,m]*B[n,k], each producing C[m,n].
+// Values 4-7 are declared but not dispatched by profile_grouped_gemm.
+// NOTE(review): the *_NM names presumably denote a C[n,m] output -- confirm.
+enum GemmMatrixLayout
+{
+    MK_KN_MN, // 0
+    MK_NK_MN, // 1
+    KM_KN_MN, // 2
+    KM_NK_MN, // 3
+    MK_KN_NM, // 4
+    MK_NK_NM, // 5
+    KM_KN_NM, // 6
+    KM_NK_NM, // 7
+};
+
+// Data type selector parsed from argv[2] by profile_grouped_gemm.
+// The usage text lists 0: fp32, 1: fp16, 2: bf16, 3: int8; only F16_F16_F16 is
+// dispatched by profile_grouped_gemm, every other value reaches its throwing branch.
+// NOTE(review): the three name components presumably give the A, B and C element
+// types -- confirm.
+enum GemmDataType
+{
+    F32_F32_F32,    // 0
+    F16_F16_F16,    // 1
+    BF16_BF16_BF16, // 2
+    INT8_INT8_INT8, // 3
+};
+
+// Splits a comma-separated argument such as "256,128,64" and converts every piece
+// with std::stoi, returning the integers in their original order. An empty string
+// yields an empty vector; a piece that std::stoi cannot convert propagates
+// std::stoi's exception.
+std::vector<int> argToIntArray(char* input)
+{
+    std::istringstream stream{std::string(input)};
+    std::vector<int> values;
+
+    for(std::string token; std::getline(stream, token, ',');)
+    {
+        values.emplace_back(std::stoi(token));
+    }
+
+    return values;
+}
+
+// Command-line entry point for the "grouped_gemm" profiler operation.
+//
+// Requires exactly 14 arguments; otherwise the usage text is printed and the process
+// exits with status 1. argv[2]..argv[7] are read as data type, matrix layout,
+// verification flag, initialization method, tensor-logging flag and repeat count.
+// argv[8]..argv[13] are comma-separated integer lists (Ms, Ns, Ks, StrideAs,
+// StrideBs, StrideCs) parsed with argToIntArray.
+//
+// Only F16_F16_F16 with layouts 0-3 is dispatched; any other combination throws
+// std::runtime_error. Returns 1 once the selected profiling routine has returned.
+int profile_grouped_gemm(int argc, char* argv[])
+{
+    if(!(argc == 14))
+    {
+        printf("arg1: tensor operation (grouped_gemm: Grouped GEMM)\n");
+        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
+        printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
+        printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
+        printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
+        printf("                     3: A[k, m] * B[n, k] = C[m, n])\n");
+        printf("arg4: verification (0: no; 1: yes)\n");
+        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg6: print tensor value (0: no; 1: yes)\n");
+        printf("arg7: run kernel # of times (>1)\n");
+        printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
+               "64,64 64,64 128,128)\n");
+        exit(1);
+    }
+
+    // Keep the enum types instead of decaying to int so the comparisons below are
+    // enum-to-enum, matching the other profiler entry points.
+    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
+    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
+    const bool do_verification = std::stoi(argv[4]);
+    const int init_method      = std::stoi(argv[5]);
+    const bool do_log          = std::stoi(argv[6]);
+    const int nrepeat          = std::stoi(argv[7]);
+
+    const auto Ms = argToIntArray(argv[8]);
+    const auto Ns = argToIntArray(argv[9]);
+    const auto Ks = argToIntArray(argv[10]);
+
+    const auto StrideAs = argToIntArray(argv[11]);
+    const auto StrideBs = argToIntArray(argv[12]);
+    const auto StrideCs = argToIntArray(argv[13]);
+
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
+                                                ck::half_t,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   nrepeat,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs);
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
+                                                ck::half_t,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   nrepeat,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs);
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
+                                                ck::half_t,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::RowMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   nrepeat,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs);
+    }
+    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
+                                                ck::half_t,
+                                                ck::half_t,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::ColumnMajor,
+                                                ck::tensor_layout::gemm::RowMajor>(do_verification,
+                                                                                   init_method,
+                                                                                   do_log,
+                                                                                   nrepeat,
+                                                                                   Ms,
+                                                                                   Ns,
+                                                                                   Ks,
+                                                                                   StrideAs,
+                                                                                   StrideBs,
+                                                                                   StrideCs);
+    }
+    else
+    {
+        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
+    }
+
+    return 1;
+}
--- a/profiler/src/profile_reduce.cpp
+++ b/profiler/src/profile_reduce.cpp
@@ -34,6 +34,8 @@ static struct option long_options[] = {{"inLengths", required_argument, nullptr,
                                       {"scales", required_argument, nullptr, 'S'},
                                       {"half", no_argument, nullptr, '?'},
                                       {"double", no_argument, nullptr, '?'},
+                                       {"int8", no_argument, nullptr, '?'},
+                                       {"bf16", no_argument, nullptr, '?'},
                                       {"dumpout", required_argument, nullptr, 'o'},
                                       {"verify", required_argument, nullptr, 'v'},
                                       {"log", required_argument, nullptr, 'l'},
@@ -82,7 +84,7 @@ static std::vector<T> getTypeValuesFromString(const char* cstr_values)
    return (values);
 }

-typedef enum
+enum struct appDataType_t
 {
    appHalf     = 0,
    appFloat    = 1,
@@ -91,7 +93,7 @@ typedef enum
    appInt8x4   = 4,
    appBFloat16 = 5,
    appDouble   = 6,
-} appDataType_t;
+};

 static void check_reduce_dims(const int rank, const std::vector<int>& reduceDims)
 {
@@ -119,6 +121,8 @@ class AppArgs
    public:
    bool use_half   = false;
    bool use_double = false;
+    bool use_int8   = false;
+    bool use_bf16   = false;

    std::vector<size_t> inLengths;
    std::vector<size_t> outLengths;
@@ -127,8 +131,8 @@ class AppArgs
    std::vector<float> scales;

    ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD;
-    appDataType_t compTypeId  = appFloat;
-    appDataType_t outTypeId   = appFloat;
+    appDataType_t compTypeId  = appDataType_t::appFloat;
+    appDataType_t outTypeId   = appDataType_t::appFloat;

    bool compType_assigned = false;
    bool outType_assigned  = false;
@@ -169,6 +173,8 @@ class AppArgs
                  << std::endl;
        std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl;
        std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl;
+        std::cout << "--int8, use int8 for the input and output tensor data types" << std::endl;
+        std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl;
        std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
                     "comparing with the host-based reduction"
                  << std::endl;
@@ -267,6 +273,10 @@ class AppArgs
                    use_half = true;
                else if(std::string(long_options[option_index].name) == "double")
                    use_double = true;
+                else if(std::string(long_options[option_index].name) == "int8")
+                    use_int8 = true;
+                else if(std::string(long_options[option_index].name) == "bf16")
+                    use_bf16 = true;
                else if(std::string(long_options[option_index].name) == "help")
                {
                    show_usage(argv[0]);
@@ -329,15 +339,16 @@ int profile_reduce(int argc, char* argv[])
    if(args.use_half)
    {
        if(!args.compType_assigned)
-            args.compTypeId = appHalf;
+            args.compTypeId = appDataType_t::appHalf;

-        if(args.outType_assigned && (args.outTypeId != appHalf && args.outTypeId != appFloat))
-            args.outTypeId = appFloat;
+        if(args.outType_assigned &&
+           (args.outTypeId != appDataType_t::appHalf && args.outTypeId != appDataType_t::appFloat))
+            args.outTypeId = appDataType_t::appFloat;

        if(!args.outType_assigned)
-            args.outTypeId = appHalf;
+            args.outTypeId = appDataType_t::appHalf;

-        if(args.compTypeId == appHalf)
+        if(args.compTypeId == appDataType_t::appHalf)
        {
            profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
                                                                    args.init_method,
@@ -352,7 +363,7 @@ int profile_reduce(int argc, char* argv[])
                                                                    args.scales[0],
                                                                    args.scales[1]);
        }
-        else if(args.compTypeId == appFloat)
+        else if(args.compTypeId == appDataType_t::appFloat)
        {
            profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
                                                               args.init_method,
@@ -385,9 +396,76 @@ int profile_reduce(int argc, char* argv[])
                                                    args.scales[0],
                                                    args.scales[1]);
    }
+    else if(args.use_int8)
+    {
+        if(!args.compType_assigned)
+            args.compTypeId = appDataType_t::appInt8;
+
+        if(args.outType_assigned &&
+           (args.outTypeId != appDataType_t::appInt8 && args.outTypeId != appDataType_t::appInt32))
+            args.outTypeId = appDataType_t::appInt32;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appDataType_t::appInt8;
+
+        if(args.compTypeId == appDataType_t::appInt8)
+        {
+            profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
+                                                        args.init_method,
+                                                        args.do_log,
+                                                        args.do_dumpout,
+                                                        args.nrepeat,
+                                                        args.inLengths,
+                                                        args.reduceDims,
+                                                        args.reduceOp,
+                                                        args.nanOpt,
+                                                        args.indicesOpt,
+                                                        args.scales[0],
+                                                        args.scales[1]);
+        }
+        else if(args.compTypeId == appDataType_t::appInt32)
+        {
+            profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
+                                                         args.init_method,
+                                                         args.do_log,
+                                                         args.do_dumpout,
+                                                         args.nrepeat,
+                                                         args.inLengths,
+                                                         args.reduceDims,
+                                                         args.reduceOp,
+                                                         args.nanOpt,
+                                                         args.indicesOpt,
+                                                         args.scales[0],
+                                                         args.scales[1]);
+        }
+        else
+            throw std::runtime_error("Invalid compType assignment!");
+    }
+    else if(args.use_bf16)
+    {
+        if(args.outType_assigned && (args.outTypeId != appDataType_t::appBFloat16 &&
+                                     args.outTypeId != appDataType_t::appFloat))
+            args.outTypeId = appDataType_t::appFloat;
+
+        if(!args.outType_assigned)
+            args.outTypeId = appDataType_t::appBFloat16;
+
+        profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
+                                                             args.init_method,
+                                                             args.do_log,
+                                                             args.do_dumpout,
+                                                             args.nrepeat,
+                                                             args.inLengths,
+                                                             args.reduceDims,
+                                                             args.reduceOp,
+                                                             args.nanOpt,
+                                                             args.indicesOpt,
+                                                             args.scales[0],
+                                                             args.scales[1]);
+    }
    else
    {
-        if(args.compTypeId == appFloat)
+        if(args.compTypeId == appDataType_t::appFloat)
        {
            profile_reduce_impl<float, float, float>(args.do_verification,
                                                     args.init_method,
@@ -402,7 +480,7 @@ int profile_reduce(int argc, char* argv[])
                                                     args.scales[0],
                                                     args.scales[1]);
        }
-        else if(args.compTypeId == appDouble)
+        else if(args.compTypeId == appDataType_t::appDouble)
        {
            profile_reduce_impl<float, double, float>(args.do_verification,
                                                      args.init_method,

--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -5,10 +5,12 @@
 #include <cstring>

 int profile_gemm(int, char*[]);
-int profile_batched_gemm(int, char*[]);
 int profile_gemm_bias_2d(int, char*[]);
 int profile_gemm_bias_relu(int, char*[]);
 int profile_gemm_bias_relu_add(int, char*[]);
+int profile_gemm_reduce(int, char*[]);
+int profile_batched_gemm(int, char*[]);
+int profile_grouped_gemm(int, char*[]);
 int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
@@ -34,10 +36,18 @@ int main(int argc, char* argv[])
    {
        return profile_gemm_bias_relu_add(argc, argv);
    }
+    else if(strcmp(argv[1], "gemm_reduce") == 0)
+    {
+        return profile_gemm_reduce(argc, argv);
+    }
    else if(strcmp(argv[1], "batched_gemm") == 0)
    {
        return profile_batched_gemm(argc, argv);
    }
+    else if(strcmp(argv[1], "grouped_gemm") == 0)
+    {
+        // Forward the profiler's result like every other dispatch branch does.
+        return profile_grouped_gemm(argc, argv);
+    }
    else if(strcmp(argv[1], "conv_fwd") == 0)
    {
        return profile_conv_fwd(argc, argv);
@@ -69,12 +79,14 @@ int main(int argc, char* argv[])
               "                        gemm_bias_2d: GEMM+Bias(2D)\n"
               "                        gemm_bias_relu: GEMM+Bias+ReLU\n"
               "                        gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
+               "                        gemm_reduce: GEMM+Reduce\n"
+               "                        grouped_gemm: Grouped Gemm\n"
               "                        conv_fwd: ForwardConvolution\n"
               "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
               "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
               "                        conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
               "                        conv_bwd: BackwardConvolution\n"
-               "                        reduce: REDUCE\n");
+               "                        reduce: Reduce\n");
        // clang-format on

        return 0;

--- a/script/cmake-rocm.sh
+++ b/script/cmake-rocm.sh
@@ -3,14 +3,14 @@ rm -f CMakeCache.txt
 rm -f *.cmake
 rm -rf CMakeFiles

-MY_PROJECT_SOURCE=../../..
+MY_PROJECT_SOURCE=../
 MY_PROJECT_INSTALL=../install.dir

 cmake                                                                                                                                          \
 -D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL}                                                                                                  \
 -D BUILD_DEV=OFF                                                                                                                               \
 -D CMAKE_BUILD_TYPE=Release                                                                                                                    \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only -save-temps=$PWD"   \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 -ftemplate-backtrace-limit=0 -mllvm --amdgpu-spill-vgpr-to-agpr=0 -gline-tables-only "   \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                                                                                                      \
 -D CMAKE_PREFIX_PATH=/opt/rocm                                                                                                                 \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                                                              \

--- a/script/profile_reduce_no_index.sh
+++ b/script/profile_reduce_no_index.sh
@@ -3,13 +3,16 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

-if test -n $PRECISION && test "$PRECISION" = "--half"; then 
+if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then
   ACCTYPE="-C 1"
-else
-   ACCTYPE=""
+elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then
+   ACCTYPE="-C 2"
 fi

+
 driver="./bin/ckProfiler"

 VERIFY="-v $1"
@@ -20,10 +23,16 @@ NREPEAT=$3
 #### 0 - ADD,  5 - AVG,  7 - NORM2
 Operations="0 5 7"

+#### 0 - ADD,  5 - AVG,    for int8, no NORM2 supported
+# The equality test alone is sufficient: it is false for an empty $PRECISION.
+if [ "$PRECISION" = "--int8" ]; then
+   Operations=5
+fi
+
 ## for generic validation
 for op in $Operations; do
    set -x
    #######        datatype   layout          reduce dims  op     acctype   verify  init  repeats
+    $driver reduce $PRECISION -D 64,4,280,82  -R 0,1,2,3   -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT
    $driver reduce $PRECISION -D 64,4,280,82  -R 0         -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT
    $driver reduce $PRECISION -D 64,4,280,82  -R 1         -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT
    $driver reduce $PRECISION -D 64,4,280,82  -R 2         -O $op $ACCTYPE  $VERIFY $INIT $NREPEAT

--- a/script/profile_reduce_with_index.sh
+++ b/script/profile_reduce_with_index.sh
@@ -3,6 +3,8 @@
 PRECISION=
 ##PRECISION=--half
 ##PRECISION=--double
+##PRECISION=--int8
+##PRECISION=--bf16

 driver="./bin/ckProfiler"

@@ -18,6 +20,7 @@ for op in $Operations; do
    for use_idx in 0 1; do
        set -x
        #######        datatype   layout          reduce dims  op     use index    verify  init  repeats
+        $driver reduce $PRECISION -D 64,4,280,82  -R 0,1,2,3   -O $op -I $use_idx  $VERIFY $INIT $NREPEAT
        $driver reduce $PRECISION -D 64,4,280,82  -R 0         -O $op -I $use_idx  $VERIFY $INIT $NREPEAT
        $driver reduce $PRECISION -D 64,4,280,82  -R 1         -O $op -I $use_idx  $VERIFY $INIT $NREPEAT
        $driver reduce $PRECISION -D 64,4,280,82  -R 2         -O $op -I $use_idx  $VERIFY $INIT $NREPEAT

--- a/script/test_convnd_fwd.sh
+++ b/script/test_convnd_fwd.sh
+#!/usr/bin/env bash
+
+# set -e
+
+# Which test dimensionalities run by default; each flag is compared against
+# the literal string "True" further down. Only 2D is enabled by default.
+DIM1=False
+DIM2=True
+DIM3=False
+# Labels used to name the log directory and the log files.
+# NOTE(review): DATE looks like YYMMDD and GIT_HASH like an abbreviated commit
+# id; both are hard-coded rather than computed -- confirm they are meant to be
+# edited by hand for each run.
+DATE=220317
+GIT_HASH=4e6dfda
+LOG_DIR=${DATE}_${GIT_HASH}
+# Log-file suffix; the -s/--suffix option appends "_<value>" to it.
+SUFFIX=${GIT_HASH}
+
+
+#--------------------------------------------------------------------------
+#   Commandline arguments parsing
+#   like: cmd -key[--key] value
+#--------------------------------------------------------------------------
+
+# Consume the recognised flags from the command line and collect every other
+# word, in order, in POSITIONAL.
+POSITIONAL=()
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        -d1|--d1)
+            DIM1=True
+            echo "DIM1: ${DIM1}"
+            ;;
+        -d2|--d2)
+            DIM2=True
+            echo "DIM2: ${DIM2}"
+            ;;
+        -d3|--d3)
+            DIM3=True
+            echo "DIM3: ${DIM3}"
+            ;;
+        -all|--all)
+            DIM1=True
+            DIM2=True
+            DIM3=True
+            echo "DIM1: ${DIM1}"
+            echo "DIM2: ${DIM2}"
+            echo "DIM3: ${DIM3}"
+            ;;
+        -s|--suffix)
+            SUFFIX="${SUFFIX}_$2"
+            echo "SUFFIX: ${SUFFIX}"
+            shift # past the flag; its value is dropped by the shared shift below
+            ;;
+        *)  # unknown option: keep it for later
+            POSITIONAL+=("$1")
+            ;;
+    esac
+    shift # past the word handled in this iteration
+done
+set -- "${POSITIONAL[@]}" # restore the collected words as positional parameters
+
+#--------------------------------------------------------------------------
+
+# NUMACTL="numactl --cpunodebind=1 --membind=1"
+# Optional launcher prefix for every test command; left empty, it expands to
+# nothing in the unquoted ${NUMACTL} ${CMD} invocations below.
+NUMACTL=
+# ENV_CONF=
+# GPU is only used as the trailing tag of the log file names.
+GPU=mi100
+# Not referenced anywhere else in this script.
+PROF_ITER_COUNT=10000
+# Logs are written under ../log, resolved against the current working directory.
+LOG_DIR_PATH=../log/${LOG_DIR}
+# Trace every command from here on.
+set -x
+
+#-------------------------------------------------------------------------------
+#               1D
+#-------------------------------------------------------------------------------
+
+# When DIM1 is "True", run the 1D forward-convolution test binary and mirror
+# its stdout and stderr into a log file.
+if [[ "${DIM1}" == "True" ]]; then
+    mkdir -p ${LOG_DIR_PATH}
+    echo ">>>>>>>> RUN test conv1d nwc <<<<<<<<<<"
+    CMD="./../build/bin/test_conv1d_fwd"
+    # This script never enables pipefail, so the pipeline's status is tee's:
+    # a failing test binary does not change the script's exit status.
+    ${NUMACTL} ${CMD} 2>&1 \
+        | tee ${LOG_DIR_PATH}/test_conv1d_fwd_nwc_${SUFFIX}_${GPU}.log
+
+fi
+
+#-------------------------------------------------------------------------------
+#               2D
+#-------------------------------------------------------------------------------
+
+# When DIM2 is "True", run the 2D forward-convolution test binary and mirror
+# its stdout and stderr into a log file.
+if [[ "${DIM2}" == "True" ]]; then
+    mkdir -p ${LOG_DIR_PATH}
+    echo ">>>>>>>> RUN test conv2d nhwc <<<<<<<<<<"
+    CMD="./../build/bin/test_conv2d_fwd"
+    # This script never enables pipefail, so the pipeline's status is tee's:
+    # a failing test binary does not change the script's exit status.
+    ${NUMACTL} ${CMD} 2>&1 \
+        | tee ${LOG_DIR_PATH}/test_conv2d_fwd_nhwc_${SUFFIX}_${GPU}.log
+
+fi
+
+#-------------------------------------------------------------------------------
+#               3D
+#-------------------------------------------------------------------------------
+
+# When DIM3 is "True", run the 3D forward-convolution test binary and mirror
+# its stdout and stderr into a log file.
+if [[ "${DIM3}" == "True" ]]; then
+    mkdir -p ${LOG_DIR_PATH}
+    echo ">>>>>>>> RUN test conv3d ndhwc <<<<<<<<<<"
+    CMD="./../build/bin/test_conv3d_fwd"
+    # This script never enables pipefail, so the pipeline's status is tee's:
+    # a failing test binary does not change the script's exit status.
+    ${NUMACTL} ${CMD} 2>&1 \
+        | tee ${LOG_DIR_PATH}/test_conv3d_fwd_ndhwc_${SUFFIX}_${GPU}.log
+
+fi
--- a/script/test_reduce_no_index.sh
+++ b/script/test_reduce_no_index.sh
+#!/bin/bash
+
+## The following will be used for CI
+##
+## Every line runs bin/test_reduce_no_index (path relative to the current
+## directory) with the same -D value and a different -R list. Per the section
+## headings, the first trailing number selects the data type
+## (0 float, 1 float16, 3 int8_t, 5 bfloat16).
+## NOTE(review): -D presumably gives the tensor lengths and -R the dimensions
+## to reduce; the meaning of the final argument (always 2) is not visible
+## here -- confirm against the test binary's argument parser.
+## NOTE(review): there is no `set -e` and the last command is `set +x`, so the
+## script exits 0 even when an invocation fails -- confirm CI detects failures
+## some other way.
+
+set -x
+
+## for float
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  0 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  0 2
+
+## for float16
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  1 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  1 2
+
+## for int8_t
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  3 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  3 2
+
+## for bfloat16
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,2  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,1,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0,2,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1,2,3  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 0  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 1  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 2  5 2
+bin/test_reduce_no_index -D 64,4,280,82  -R 3  5 2
+
+set +x
+
--- a/script/test_reduce_with_index.sh
+++ b/script/test_reduce_with_index.sh
+#!/bin/bash
+
+## The following will be used for CI
+##
+## Every line runs bin/test_reduce_with_index (path relative to the current
+## directory) with the same -D value and a different -R list. Per the section
+## headings, the first trailing number selects the data type
+## (0 float, 1 float16, 3 int8_t, 5 bfloat16).
+## NOTE(review): -D presumably gives the tensor lengths and -R the dimensions
+## to reduce; the meaning of the final argument (always 2) is not visible
+## here -- confirm against the test binary's argument parser.
+## NOTE(review): there is no `set -e` and the last command is `set +x`, so the
+## script exits 0 even when an invocation fails -- confirm CI detects failures
+## some other way.
+
+set -x
+
+## for float
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  0 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  0 2
+
+## for float16
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  1 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  1 2
+
+## for int8_t
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  3 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  3 2
+
+## for bfloat16
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,2  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,1,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0,2,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1,2,3  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 0  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 1  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 2  5 2
+bin/test_reduce_with_index -D 64,4,280,82  -R 3  5 2
+
+set +x
+
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -17,6 +17,7 @@ include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
    ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
    ${PROJECT_SOURCE_DIR}/test/include
+    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/external/include/half
 )

@@ -37,7 +38,10 @@ add_subdirectory(conv_util)
 add_subdirectory(reference_conv_fwd)
 add_subdirectory(gemm)
 add_subdirectory(gemm_split_k)
-add_subdirectory(conv2d_fwd)
+add_subdirectory(gemm_reduce)
+add_subdirectory(batched_gemm)
+add_subdirectory(grouped_gemm)
 add_subdirectory(convnd_fwd)
 add_subdirectory(conv2d_bwd_data)
+add_subdirectory(reduce)
 add_subdirectory(cpu_ukernel)
--- a/test/batched_gemm/CMakeLists.txt
+++ b/test/batched_gemm/CMakeLists.txt
+# fp16 batched GEMM test; links the host tensor helpers and the batched GEMM
+# device instance library.
+add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp)
+target_link_libraries(test_batched_gemm_fp16
+    PRIVATE
+        host_tensor
+        device_batched_gemm_instance)
+
--- a/test/batched_gemm/batched_gemm_fp16.cpp
+++ b/test/batched_gemm/batched_gemm_fp16.cpp
+#include <half.hpp>
+#include <tuple>
+#include <vector>
+
+#include "batched_gemm_util.hpp"
+#include "reference_batched_gemm.hpp"
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "device_batched_gemm_xdl.hpp"
+#include "element_wise_operation.hpp"
+#include "test_util.hpp"
+
+// Shorthand for the element-wise operation used for A, B and C in this test.
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+// Pointer type under which the device batched GEMM instances are registered
+// and invoked in this file.
+using DeviceBatchedGemmPtr =
+    ck::tensor_operation::device::DeviceGemmPtr<PassThrough, PassThrough, PassThrough>;
+
+// Forward declaration of the instance factory called from main(); it appends
+// device batched GEMM instances to the given vector. The definition is not in
+// this file -- presumably it comes from the device_batched_gemm_instance
+// library the test links against.
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_batched_gemm_instance {
+void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
+    std::vector<DeviceBatchedGemmPtr>& instances);
+} // namespace device_batched_gemm_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+
+namespace {
+// Element types of the GEMM under test: ck::half_t for A, B and C.
+using ADataType   = ck::half_t;
+using BDataType   = ck::half_t;
+using CDataType   = ck::half_t;
+// Not referenced elsewhere in this file.
+using AccDataType = float;
+
+// Matrix layouts: A and C are row-major, B is column-major.
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+// Builds the four host tensors used by the test and returns them as a tuple
+// in the order (A, B, C host reference, C device result). A and B are filled
+// through GeneratorTensor_3 constructed with {-0.5, 0.5}; the two C tensors
+// are returned as constructed.
+auto PrepareGemmTensor(const std::size_t batch_count,
+                       const ck::batched_gemm_util::GemmParams& params)
+{
+    using RowMajor = ck::tensor_layout::gemm::RowMajor;
+
+    // Describes a {batch_count, row, col} tensor. Row-major layouts get unit
+    // stride along col; any other layout gets unit stride along row.
+    const auto make_descriptor =
+        [batch_count](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+            if constexpr(std::is_same_v<decltype(layout), RowMajor>)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                            std::vector<std::size_t>({row * stride, stride, 1}));
+            }
+            else
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                            std::vector<std::size_t>({col * stride, 1, stride}));
+            }
+        };
+
+    Tensor<ADataType> a_tensor(make_descriptor(params.M, params.K, params.StrideA, ALayout{}));
+    Tensor<BDataType> b_tensor(make_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+
+    // Both C tensors share the same description.
+    Tensor<CDataType> c_host_tensor(
+        make_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+    Tensor<CDataType> c_device_tensor(
+        make_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+
+    a_tensor.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
+    b_tensor.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+
+    return std::make_tuple(a_tensor, b_tensor, c_host_tensor, c_device_tensor);
+}
+
+// Runs one device batched GEMM instance on a 1024 x 1024 x 1024 problem with
+// batch_count batches, compares it with the host reference and prints
+// SUCCESS or FAILURE. Returns true when check_error stays below the threshold.
+bool TestBatchedGemm(const std::size_t batch_count, DeviceBatchedGemmPtr& gemmPtr)
+{
+    // Arrange
+    ck::batched_gemm_util::GemmParams params;
+    params.M = params.N = params.K = 1024;
+    params.StrideA = params.StrideB = params.StrideC = 1024;
+
+    auto [a, b, c_host, c_device] = PrepareGemmTensor(batch_count, params);
+
+    // One element-wise op object serves A, B and C; it is passed by value.
+    PassThrough element_op{};
+
+    using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
+        ReferenceBatchedGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
+
+    ck::batched_gemm_util::RunHostBatchedGemm<ReferenceBatchedGemmInstance>(
+        a, b, c_host, element_op, element_op, element_op);
+
+    // Act
+    ck::batched_gemm_util::RunDeviceBatchedGemm(
+        gemmPtr, params, a, b, c_device, element_op, element_op, element_op);
+
+    // Assert
+    // bool pass = test::check_err(
+    // c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+    const bool pass = check_error(c_device, c_host) < 0.007815f;
+
+    std::cout << (pass ? "SUCCESS" : "FAILURE") << std::endl;
+
+    return pass;
+}
+} // namespace
+
+// Runs every registered fp16 batched GEMM device instance through
+// TestBatchedGemm. Returns 0 only if at least one instance was registered and
+// all of them passed; returns 1 otherwise.
+int main()
+{
+    std::vector<DeviceBatchedGemmPtr> batched_gemm_ptrs;
+    ck::tensor_operation::device::device_batched_gemm_instance::
+        add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(batched_gemm_ptrs);
+
+    // With no instances the loop below never runs; treat that as a failure
+    // instead of reporting a vacuous SUCCESS.
+    bool pass = !batched_gemm_ptrs.empty();
+    if(!pass)
+    {
+        std::cout << "Error: no device batched GEMM instance found" << std::endl;
+    }
+
+    const std::size_t batch_count = 4;
+    for(auto& gemmPtr : batched_gemm_ptrs)
+    {
+        // Call first so every instance still runs after an earlier failure.
+        pass = TestBatchedGemm(batch_count, gemmPtr) && pass;
+    }
+
+    std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;
+
+    return pass ? 0 : 1;
+}
--- a/test/batched_gemm/batched_gemm_util.hpp
+++ b/test/batched_gemm/batched_gemm_util.hpp
+#ifndef BATCHED_GEMM_UTILS_HPP
+#define BATCHED_GEMM_UTILS_HPP
+
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+
+namespace ck {
+namespace batched_gemm_util {
+
+// Problem description for the batched GEMM test helpers. Every size and
+// stride defaults to 1024; alpha defaults to 1 and beta to 0.
+// alpha and beta are not read by the helpers in this header.
+struct GemmParams
+{
+    // User-provided on purpose: keeps the type a non-aggregate, as it was
+    // when the defaults lived in a member-initializer list.
+    GemmParams() {}
+
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = 1024;
+    ck::index_t StrideB = 1024;
+    ck::index_t StrideC = 1024;
+
+    float alpha = 1;
+    float beta  = 0;
+};
+
+// Runs the host reference batched GEMM. A default-constructed
+// BatchedGemmInstance supplies the invoker and the argument; the argument is
+// built from A, B, C and the three element-wise operations, then run once.
+// C is the only tensor passed as non-const.
+template <typename BatchedGemmInstance,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+void RunHostBatchedGemm(const Tensor<ADataType>& A,
+                        const Tensor<BDataType>& B,
+                        Tensor<CDataType>& C,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CElementwiseOperation c_element_op)
+{
+    BatchedGemmInstance reference_gemm{};
+
+    auto invoker = reference_gemm.MakeInvoker();
+    auto argument =
+        reference_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);
+
+    invoker.Run(argument);
+}
+
+// Runs one device batched GEMM instance and copies its result into C.
+//
+// Device buffers are sized from each tensor's element space; A and B are
+// uploaded, the C buffer is not initialised before the run. The batch count
+// is taken from the first length of A's descriptor.
+// Throws std::runtime_error when the instance reports the argument as
+// unsupported.
+template <typename DeviceGemmPtr,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+void RunDeviceBatchedGemm(DeviceGemmPtr& batched_gemm_ptr,
+                          const ck::batched_gemm_util::GemmParams& params,
+                          const Tensor<ADataType>& A,
+                          const Tensor<BDataType>& B,
+                          Tensor<CDataType>& C,
+                          AElementwiseOperation a_element_op,
+                          BElementwiseOperation b_element_op,
+                          CElementwiseOperation c_element_op)
+{
+    DeviceMem a_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpace());
+    DeviceMem b_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpace());
+    DeviceMem c_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpace());
+
+    // Upload the inputs.
+    a_device_buf.ToDevice(A.mData.data());
+    b_device_buf.ToDevice(B.mData.data());
+
+    const auto batch_count = A.mDesc.GetLengths()[0];
+
+    auto invoker  = batched_gemm_ptr->MakeInvokerPointer();
+    auto argument = batched_gemm_ptr->MakeArgumentPointer(
+        static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+        params.M,
+        params.N,
+        params.K,
+        params.StrideA,
+        params.StrideB,
+        params.StrideC,
+        a_element_op,
+        b_element_op,
+        c_element_op,
+        batch_count);
+
+    // Refuse to launch an instance that cannot handle this problem.
+    const bool supported = batched_gemm_ptr->IsSupportedArgument(argument.get());
+    if(!supported)
+    {
+        throw std::runtime_error(
+            "wrong! device_gemm with the specified compilation parameters does "
+            "not support this GEMM problem");
+    }
+
+    invoker->Run(argument.get());
+
+    // Download the result into the caller's tensor.
+    c_device_buf.FromDevice(C.mData.data());
+}
+
+} // namespace batched_gemm_util
+} // namespace ck
+#endif
--- a/test/conv2d_bwd_data/conv2d_bwd_data.cpp
+++ b/test/conv2d_bwd_data/conv2d_bwd_data.cpp
@@ -182,8 +182,8 @@ int main(int argc, char* argv[])

        out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
        wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-
-        in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{5});
+        // reset input to zero
+        in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
        in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());

        // get host result
@@ -225,9 +225,9 @@ int main(int argc, char* argv[])
            ck::tensor_operation::device::device_conv2d_bwd_data_instance::
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
        }
-        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ushort> &&
-                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ushort> &&
-                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ushort>)
+        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
+                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
+                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
        {
            ck::tensor_operation::device::device_conv2d_bwd_data_instance::
                add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);

--- a/test/conv2d_fwd/CMakeLists.txt
+++ b/test/conv2d_fwd/CMakeLists.txt
-add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
-target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
-target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
--- a/test/conv2d_fwd/conv2d_fwd.cpp
+++ b/test/conv2d_fwd/conv2d_fwd.cpp
-#include "config.hpp"
-#include "device.hpp"
-#include "host_tensor.hpp"
-#include "host_tensor_generator.hpp"
-#include "host_conv.hpp"
-#include "tensor_layout.hpp"
-#include "device_tensor.hpp"
-#include "device_conv_fwd.hpp"
-#include "element_wise_operation.hpp"
-#include "reference_conv_fwd.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace device_conv2d_fwd_instance {
-
-using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr<ck::tensor_operation::element_wise::PassThrough,
-                                              ck::tensor_operation::element_wise::PassThrough,
-                                              ck::tensor_operation::element_wise::PassThrough>;
-
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-
-void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(
-    std::vector<DeviceConvFwdNoOpPtr>&);
-
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-
-void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector<DeviceConvFwdNoOpPtr>&);
-} // namespace device_conv2d_fwd_instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
-
-using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
-using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
-using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
-
-template <typename T>
-static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float max_diff = 1e-6;
-
-    for(int i = 0; i < ref.mData.size(); ++i)
-    {
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
-        {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-int main(int argc, char* argv[])
-{
-    int data_type   = 0;
-    int init_method = 0;
-
-    // Conv shape
-    ck::index_t N               = 128;
-    ck::index_t K               = 256;
-    ck::index_t C               = 192;
-    ck::index_t Y               = 3;
-    ck::index_t X               = 3;
-    ck::index_t Hi              = 71;
-    ck::index_t Wi              = 71;
-    ck::index_t conv_stride_h   = 2;
-    ck::index_t conv_stride_w   = 2;
-    ck::index_t conv_dilation_h = 1;
-    ck::index_t conv_dilation_w = 1;
-    ck::index_t in_left_pad_h   = 1;
-    ck::index_t in_left_pad_w   = 1;
-    ck::index_t in_right_pad_h  = 1;
-    ck::index_t in_right_pad_w  = 1;
-    if(argc == 1)
-    {
-        data_type   = 1;
-        init_method = 1;
-    }
-    else if(argc == 3)
-    {
-        data_type   = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-    }
-    else if(argc == 18)
-    {
-        data_type   = std::stoi(argv[1]);
-        init_method = std::stoi(argv[2]);
-
-        N               = std::stoi(argv[3]);
-        K               = std::stoi(argv[4]);
-        C               = std::stoi(argv[5]);
-        Y               = std::stoi(argv[6]);
-        X               = std::stoi(argv[7]);
-        Hi              = std::stoi(argv[8]);
-        Wi              = std::stoi(argv[9]);
-        conv_stride_h   = std::stoi(argv[10]);
-        conv_stride_w   = std::stoi(argv[11]);
-        conv_dilation_h = std::stoi(argv[12]);
-        conv_dilation_w = std::stoi(argv[13]);
-        in_left_pad_h   = std::stoi(argv[14]);
-        in_left_pad_w   = std::stoi(argv[15]);
-        in_right_pad_h  = std::stoi(argv[16]);
-        in_right_pad_w  = std::stoi(argv[17]);
-    }
-    else
-    {
-        printf("arg1: data type (0=fp32, 1=fp16, 2= bfp16, 3= int8_t )\n");
-        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
-        printf("arg3 to 17: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
-               "RightPx\n");
-        exit(1);
-    }
-
-    auto Run = [&](auto input_type, auto wei_type, auto out_type) {
-        using InDataType  = decltype(input_type);
-        using WeiDataType = decltype(wei_type);
-        using OutDataType = decltype(out_type);
-
-        using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
-                                                                                      WeiDataType,
-                                                                                      OutDataType,
-                                                                                      InElementOp,
-                                                                                      WeiElementOp,
-                                                                                      OutElementOp>;
-
-        const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
-        const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
-
-        const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
-        const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-
-        const std::vector<ck::index_t> input_spatial_lengths{Hi, Wi};
-        const std::vector<ck::index_t> filter_spatial_lengths{Y, X};
-        const std::vector<ck::index_t> output_spatial_lengths{Ho, Wo};
-        const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
-        const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
-        const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
-        const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
-
-        auto f_host_tensor_descriptor =
-            [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
-                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
-                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
-            };
-
-        Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
-        Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X));
-        Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-        Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-
-        std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
-        std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
-        std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
-
-        switch(init_method)
-        {
-        case 0: break;
-        case 1:
-            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
-            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
-            break;
-        default:
-            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1});
-            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1});
-        }
-
-        DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
-        DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
-        DeviceMem out_device_buf(sizeof(OutDataType) *
-                                 out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
-
-        in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
-        wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
-
-        using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-
-        using DeviceConvFwdNoOpPtr =
-            ck::tensor_operation::device::DeviceConvFwdPtr<PassThrough, PassThrough, PassThrough>;
-
-        // add device Conv instances
-        std::vector<DeviceConvFwdNoOpPtr> conv_ptrs;
-
-        if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, float> &&
-                     ck::is_same_v<ck::remove_cv_t<WeiDataType>, float> &&
-                     ck::is_same_v<ck::remove_cv_t<OutDataType>, float>)
-        {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
-                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs);
-        }
-        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
-                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
-                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
-        {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
-                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
-                add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
-        }
-        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::bhalf_t> &&
-                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::bhalf_t> &&
-                          ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::bhalf_t>)
-        {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
-                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs);
-        }
-        else if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, int8_t> &&
-                          ck::is_same_v<ck::remove_cv_t<WeiDataType>, int8_t> &&
-                          ck::is_same_v<ck::remove_cv_t<OutDataType>, int8_t>)
-        {
-            ck::tensor_operation::device::device_conv2d_fwd_instance::
-                add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs);
-        }
-
-        if(conv_ptrs.size() <= 0)
-        {
-            throw std::runtime_error("wrong! no device Conv instance found");
-        }
-
-        auto ref_conv    = ReferenceConvFwdInstance{};
-        auto ref_invoker = ref_conv.MakeInvoker();
-
-        auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi,
-                                                  wei_k_c_y_x,
-                                                  out_n_k_ho_wo_host_result,
-                                                  conv_filter_strides,
-                                                  conv_filter_dilations,
-                                                  input_left_pads,
-                                                  input_right_pads,
-                                                  InElementOp{},
-                                                  WeiElementOp{},
-                                                  OutElementOp{});
-
-        ref_invoker.Run(ref_argument);
-
-        // profile device Conv instances
-        bool success = false;
-        for(auto& conv_ptr : conv_ptrs)
-        {
-            auto argument_ptr = conv_ptr->MakeArgumentPointer(
-                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
-                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
-                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                N,
-                K,
-                C,
-                input_spatial_lengths,
-                filter_spatial_lengths,
-                output_spatial_lengths,
-                conv_filter_strides,
-                conv_filter_dilations,
-                input_left_pads,
-                input_right_pads,
-                PassThrough{},
-                PassThrough{},
-                PassThrough{});
-
-            auto invoker_ptr = conv_ptr->MakeInvokerPointer();
-
-            if(conv_ptr->IsSupportedArgument(argument_ptr.get()))
-            {
-                invoker_ptr->Run(argument_ptr.get(), 0);
-
-                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-                if(!check_out(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result))
-                {
-                    success = false;
-                    break;
-                }
-                success = true;
-            }
-        }
-
-        if(success)
-        {
-            std::cout << "test conv2d fwd : Pass" << std::endl;
-            return 0;
-        }
-        else
-        {
-            std::cout << "test conv2d fwd: Fail " << std::endl;
-            return -1;
-        }
-    };
-    int res = -1;
-    if(data_type == 0)
-    {
-        res = Run(float(), float(), float());
-    }
-    else if(data_type == 1)
-    {
-        res = Run(ck::half_t(), ck::half_t(), ck::half_t());
-    }
-    else if(data_type == 2)
-    {
-        Run(ck::bhalf_t(), ck::bhalf_t(), ck::bhalf_t());
-    }
-    else if(data_type == 3)
-    {
-        res = Run(int8_t(), int8_t(), int8_t());
-    }
-
-    return res;
-}
--- a/test/conv_util/conv_util.cpp
+++ b/test/conv_util/conv_util.cpp
@@ -5,33 +5,10 @@
 #include "config.hpp"
 #include "conv_utils.hpp"
 #include "tensor_layout.hpp"
+#include "test_util.hpp"

 namespace {

-template <typename T>
-bool cmp_vec(const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg)
-{
-    if(out.size() != ref.size())
-    {
-        std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
-                  << std::endl
-                  << msg << std::endl;
-        return false;
-    }
-
-    for(std::size_t i = 0; i < ref.size(); ++i)
-    {
-        if(out[i] != ref[i])
-        {
-            std::cout << "out[" << i << "] != ref[" << i << "]: " << out[i] << "!=" << ref[i]
-                      << std::endl
-                      << msg << std::endl;
-            return false;
-        }
-    }
-    return true;
-}
-
 bool TestConvParams_GetOutputSpatialLengths()
 {
    bool res{true};
@@ -43,26 +20,26 @@ bool TestConvParams_GetOutputSpatialLengths()
    // padding {{1,1}, {1,1}}
    ck::conv_util::ConvParams conv_params;
    std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res                                      = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{36, 36},
-                  "Error: ConvParams 2D default constructor.");
+    res                                      = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{36, 36},
+                          "Error: ConvParams 2D default constructor.");

    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = cmp_vec(
+    res                             = test::check_err(
        out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");

    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2};
    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{37, 37},
-                  "Error: ConvParams 2D padding left/right {2,2}.");
+    res                             = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{37, 37},
+                          "Error: ConvParams 2D padding left/right {2,2}.");

    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = cmp_vec(
+    res                               = test::check_err(
        out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");

    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3};
@@ -70,9 +47,9 @@ bool TestConvParams_GetOutputSpatialLengths()
    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1};
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{23, 23},
-                  "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
+    res                               = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{23, 23},
+                          "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");

    // -------------------------- 1D ------------------------------------
    conv_params.num_dim_spatial        = 1;
@@ -84,25 +61,24 @@ bool TestConvParams_GetOutputSpatialLengths()
    conv_params.input_right_pads       = std::vector<ck::index_t>{1};

    out_spatial_len = conv_params.GetOutputSpatialLengths();
-    res             = cmp_vec(
-        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D default constructor.");
+    res = test::check_err(out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D.");

    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res =
-        cmp_vec(out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
+    res                             = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");

    conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
    conv_params.input_left_pads     = std::vector<ck::index_t>{2};
    conv_params.input_right_pads    = std::vector<ck::index_t>{2};
    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
-    res                             = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{37},
-                  "Error: ConvParams 1D padding left/right {2}.");
+    res                             = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{37},
+                          "Error: ConvParams 1D padding left/right {2}.");

    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = cmp_vec(
+    res                               = test::check_err(
        out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");

    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3};
@@ -110,9 +86,52 @@ bool TestConvParams_GetOutputSpatialLengths()
    conv_params.input_right_pads      = std::vector<ck::index_t>{1};
    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
-    res                               = cmp_vec(out_spatial_len,
-                  std::vector<ck::index_t>{23},
-                  "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+    res                               = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{23},
+                          "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
+
+    // -------------------------- 3D ------------------------------------
+    conv_params.num_dim_spatial        = 3;
+    conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3, 3};
+    conv_params.input_spatial_lengths  = std::vector<ck::index_t>{71, 71, 71};
+    conv_params.conv_filter_strides    = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.conv_filter_dilations  = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_left_pads        = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_right_pads       = std::vector<ck::index_t>{1, 1, 1};
+
+    out_spatial_len = conv_params.GetOutputSpatialLengths();
+    res             = test::check_err(
+        out_spatial_len, std::vector<ck::index_t>{36, 36, 36}, "Error: ConvParams 3D.");
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1, 1};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res                             = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{71, 71, 71},
+                          "Error: ConvParams 3D stride {1, 1, 1}.");
+
+    conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_left_pads     = std::vector<ck::index_t>{2, 2, 2};
+    conv_params.input_right_pads    = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                 = conv_params.GetOutputSpatialLengths();
+    res                             = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{37, 37, 37},
+                          "Error: ConvParams 3D padding left/right {2, 2, 2}.");
+
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res                               = test::check_err(out_spatial_len,
+                          std::vector<ck::index_t>{36, 36, 36},
+                          "Error: ConvParams 3D dilation {2, 2, 2}.");
+
+    conv_params.conv_filter_strides   = std::vector<ck::index_t>{3, 3, 3};
+    conv_params.input_left_pads       = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.input_right_pads      = std::vector<ck::index_t>{1, 1, 1};
+    conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2, 2};
+    out_spatial_len                   = conv_params.GetOutputSpatialLengths();
+    res                               = test::check_err(
+        out_spatial_len,
+        std::vector<ck::index_t>{23, 23, 23},
+        "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.");

    return res;
 }
@@ -123,23 +142,44 @@ bool TestGetHostTensorDescriptor()
    namespace tl = ck::tensor_layout::convolution;
    std::vector<std::size_t> dims{2, 3, 4, 5};
    HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
-    res =
-        cmp_vec(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
+    res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
+    res = test::check_err(
+        h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");

    h   = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
-    res =
-        cmp_vec(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
+    res = test::check_err(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
+    res = test::check_err(
+        h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");

    dims = std::vector<std::size_t>{2, 3, 4};
    h    = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
-    res  = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
-    res  = cmp_vec(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
+    res  = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
+    res  = test::check_err(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");

    h   = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{});
-    res = cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
-    res = cmp_vec(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+    res = test::check_err(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
+    res = test::check_err(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
+
+    dims = std::vector<std::size_t>{2, 3, 4, 5, 6};
+    h    = ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
+    res  = test::check_err(h.GetLengths(), dims, "Error: wrong NDHWC dimensions lengths!");
+    res  = test::check_err(h.GetStrides(),
+                          {3 * 4 * 5 * 6, // N
+                           1,             // C
+                           3 * 5 * 6,     // D
+                           3 * 6,         // H
+                           3},            // W
+                          "Error: wrong NDHWC dimensions strides!");
+
+    h   = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCDHW{});
+    res = test::check_err(h.GetLengths(), dims, "Error: wrong NCDHW dimensions lengths!");
+    res = test::check_err(h.GetStrides(),
+                          {3 * 4 * 5 * 6, // N
+                           4 * 5 * 6,     // C
+                           5 * 6,         // D
+                           6,             // H
+                           1},            // W
+                          "Error: wrong NCDHW dimensions strides!");

    return res;
 }

--- a/test/convnd_fwd/CMakeLists.txt
+++ b/test/convnd_fwd/CMakeLists.txt
-add_test_executable(test_convnd_fwd convnd_fwd.cpp)
-target_link_libraries(test_convnd_fwd PRIVATE host_tensor)
+add_custom_target(test_convnd_fwd)
+
+add_test_executable(test_conv1d_fwd conv1d_fwd.cpp)
+target_link_libraries(test_conv1d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv1d_fwd PRIVATE device_conv1d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv1d_fwd)
+
+add_test_executable(test_conv2d_fwd conv2d_fwd.cpp)
+target_link_libraries(test_conv2d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv2d_fwd)
+
+add_test_executable(test_conv3d_fwd conv3d_fwd.cpp)
+target_link_libraries(test_conv3d_fwd PRIVATE host_tensor)
+target_link_libraries(test_conv3d_fwd PRIVATE device_conv3d_fwd_instance)
+add_dependencies(test_convnd_fwd test_conv3d_fwd)
+