Merge branch 'develop' into lwpck-471

05ee41c3 · Rosty Geyyer · 37116c98 · ad541ad6 · 05ee41c3 · 05ee41c3
Commit 05ee41c3 authored Nov 30, 2022 by Rosty Geyyer
20 changed files
--- a/profiler/src/profile_batchnorm_fwd.cpp
+++ b/profiler/src/profile_batchnorm_fwd.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <vector>
+#include <getopt.h>
+#include "ck/library/utility/host_common_util.hpp"
+#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+using ck::index_t;
+using namespace std;
+static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, // entries are {name, has_arg, flag, val}; flag is nullptr throughout, so getopt_long returns val
+                                             {"reduceDims", required_argument, nullptr, 'R'},
+                                             {"dumpout", required_argument, nullptr, 'o'},
+                                             {"verify", required_argument, nullptr, 'v'},
+                                             {"help", no_argument, nullptr, '?'}, // NOTE: '?' is also what getopt_long returns for an unrecognized option
+                                             {nullptr, 0, nullptr, 0}}; // all-zero entry terminates the table
+// Command-line parser for the batchnorm-forward profiler module.
+//
+// Named options are parsed with getopt_long() against the file-level long_options table:
+//   --inOutLengths/-D, --reduceDims/-R, --verify/-v, --dumpout/-o, --help
+// They are followed by five positional integers, in this order: data type,
+// update-moving-average flag, save-mean-and-invVariance flag, init method, time-kernel flag.
+//
+// The parsed values are exposed as public data members.
+class BatchnormFwdArgParser
+{
+    private:
+    // Index into long_options written by getopt_long() when a long option matched.
+    int option_index = 0;
+
+    public:
+    std::vector<size_t> inLengths;
+    std::vector<int> reduceDims;
+
+    bool do_verification = false;
+    bool do_dumpout      = false;
+
+    // Initialized so that a parser whose operator() bailed out early never holds
+    // indeterminate values.
+    bool updateMovingAverage    = false;
+    bool saveMeanAndInvVariance = false;
+
+    int data_type    = 0;
+    int init_method  = 2;
+    bool time_kernel = false;
+
+    BatchnormFwdArgParser()  = default;
+    ~BatchnormFwdArgParser() = default;
+
+    // Prints the supported named options and positional arguments to stdout.
+    // cmd: program name shown in the first line of the usage text.
+    void show_usage(const char* cmd)
+    {
+        // clang-format off
+        std::cout << "Usage of " << cmd << std::endl;
+        std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl;
+        std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl;
+        std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl;
+        std::cout << "--dumpout or -o, 1/0 to indicate whether to dump out the result" << std::endl;
+        std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bf16, 6: fp64)" << std::endl;
+        std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)" << std::endl;
+        std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance (0=no, 1=yes)" << std::endl;
+        std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl;
+        std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl;
+        // clang-format on
+    }
+
+    // Parses argv into the public members.
+    //
+    // Returns 0 on success and -1 when --help was requested, an option was not
+    // recognized, or the data type is outside the accepted set.
+    // Throws std::runtime_error when fewer than five positional arguments follow
+    // the named options.
+    int operator()(int argc, char* argv[])
+    {
+        using ck::host_common::getTypeValuesFromString;
+
+        optind++; // to skip the module name
+
+        while(true)
+        {
+            // getopt_long() writes option_index only when a long option matched. Reset it
+            // before every call so that a stale index from a previous iteration can never
+            // be mistaken for the current option in the '?' case below.
+            option_index = -1;
+
+            const int ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index);
+
+            if(ch == -1)
+                break;
+
+            switch(ch)
+            {
+            case 'D':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                inLengths = getTypeValuesFromString<size_t>(optarg);
+                break;
+            case 'R':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                reduceDims = getTypeValuesFromString<int>(optarg);
+                break;
+            case 'v':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                do_verification = static_cast<bool>(std::atoi(optarg));
+                break;
+            case 'o':
+                if(!optarg)
+                    throw std::runtime_error("Invalid option format!");
+                do_dumpout = static_cast<bool>(std::atoi(optarg));
+                break;
+            case '?':
+                // "--help" is registered with val '?', which is also the value getopt_long()
+                // returns for an unrecognized option or a missing option argument. Only a
+                // freshly written option_index tells the two apart.
+                if(option_index >= 0 && std::string(long_options[option_index].name) == "help")
+                {
+                    show_usage(argv[0]);
+                    return -1;
+                }
+                // Anything else is a malformed command line; do not silently skip it.
+                show_usage(argv[0]);
+                std::cerr << "Invalid cmd-line options!" << std::endl;
+                return -1;
+            default:
+                show_usage(argv[0]);
+                std::cerr << "Invalid cmd-line options!" << std::endl;
+                return -1;
+            }
+        }
+
+        // Five positional arguments must remain after the named options.
+        if(optind + 5 > argc)
+            throw std::runtime_error("Invalid cmd-line arguments, more arguments are needed!");
+
+        data_type              = std::atoi(argv[optind++]);
+        updateMovingAverage    = static_cast<bool>(std::atoi(argv[optind++]));
+        saveMeanAndInvVariance = static_cast<bool>(std::atoi(argv[optind++]));
+        init_method            = std::atoi(argv[optind++]);
+        time_kernel            = static_cast<bool>(std::atoi(argv[optind++]));
+
+        // NOTE(review): 3 is accepted here although show_usage() does not list it; the
+        // accepted set is kept unchanged.
+        if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6)
+        {
+            std::cerr << "Invalid data type: " << data_type << std::endl;
+            return -1;
+        }
+
+        return 0;
+    }
+}; // end of class BatchnormFwdArgParser
+static const double epsilon       = std::numeric_limits<float>::epsilon(); // machine epsilon of float, stored as double; forwarded to every profile_batchnorm_forward_impl call
+static const double averageFactor = 0.1; // forwarded as the last argument of every profile_batchnorm_forward_impl call
+// Entry point of the batchnorm-forward profiler module.
+//
+// Parses the command line with BatchnormFwdArgParser and forwards the parsed values to
+// the profile_batchnorm_forward_impl instantiation selected by the data-type argument.
+//
+// Returns 0 once a profiling call has been issued. Returns -1 when argument parsing
+// fails, or when the requested tensor rank / number of reduce dimensions / data type has
+// no instantiation here (previously these cases returned 0 without doing anything).
+// Exceptions thrown by the parser propagate to the caller.
+int profile_batchnorm_forward(int argc, char* argv[])
+{
+    using ck::profiler::profile_batchnorm_forward_impl;
+
+    BatchnormFwdArgParser arg_parser;
+
+    if(arg_parser(argc, argv) != 0)
+        return -1;
+
+    // Only 4-d tensors reduced over 3 dimensions are instantiated below.
+    if(arg_parser.inLengths.size() != 4 || arg_parser.reduceDims.size() != 3)
+    {
+        std::cerr << "Unsupported problem: expected 4 tensor lengths and 3 reduce dimensions, got "
+                  << arg_parser.inLengths.size() << " and " << arg_parser.reduceDims.size()
+                  << std::endl;
+        return -1;
+    }
+
+    using F16  = ck::half_t;
+    using F32  = float;
+    using BF16 = ck::bhalf_t;
+    using F64  = double;
+
+    // The arguments are value-initialized tags; only their types are used, as the six
+    // type template arguments of profile_batchnorm_forward_impl (in order).
+    const auto run = [&](auto t0, auto t1, auto t2, auto t3, auto t4, auto t5) {
+        profile_batchnorm_forward_impl<decltype(t0),
+                                       decltype(t1),
+                                       decltype(t2),
+                                       decltype(t3),
+                                       decltype(t4),
+                                       decltype(t5),
+                                       4,
+                                       3>(arg_parser.do_verification,
+                                          arg_parser.init_method,
+                                          arg_parser.do_dumpout,
+                                          arg_parser.time_kernel,
+                                          arg_parser.inLengths,
+                                          arg_parser.reduceDims,
+                                          arg_parser.updateMovingAverage,
+                                          arg_parser.saveMeanAndInvVariance,
+                                          epsilon,
+                                          averageFactor);
+    };
+
+    switch(arg_parser.data_type)
+    {
+    case 0: run(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}); break;
+    case 1: run(F32{}, F32{}, F32{}, F32{}, F32{}, F32{}); break;
+    case 5: run(BF16{}, BF16{}, F32{}, BF16{}, BF16{}, F32{}); break;
+    case 6: run(F64{}, F64{}, F64{}, F64{}, F64{}, F64{}); break;
+    default:
+        // The parser may let through a data type that has no instantiation here.
+        std::cerr << "Data type " << arg_parser.data_type
+                  << " is not supported by the batchnorm forward profiler" << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
--- a/profiler/src/profile_conv_bwd_weight.cpp
+++ b/profiler/src/profile_conv_bwd_weight.cpp
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
-#include "profiler/include/profile_conv_bwd_weight_impl.hpp"
+#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
 namespace {
 enum struct ConvLayout
 {
-    NCHW_KCYX_NKHW, // 0
+    GNCHW_GKCYX_GNKHW, // 0
-    NHWC_KYXC_NHWK, // 1
+    GNHWC_GKYXC_GNHWK, // 1
 };
 enum struct ConvDataType
@@ -25,24 +25,25 @@ enum struct ConvDataType
 static void print_helper_msg()
 {
-    std::cout
+    std::cout << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
-        << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
+              << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
-        << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
+              << "                 1: Input fp16, Weight fp16, Output fp16\n"
-        << "                 1: Input fp16, Weight fp16, Output fp16\n"
+              << "                 2: Input bf16, Weight fp32, Output bf16)\n"
-        << "                 2: Input bf16, Weight fp32, Output bf16)\n"
+              << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
-        << "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
+                 "N, K, Ho, Wo]\n"
-        << "                     1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, K]\n"
+              << "                     1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
-        << "arg4: verification (0: no, 1: yes)\n"
+                 "N, Ho, Wo, K]\n"
-        << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
+              << "arg4: verification (0: no, 1: yes)\n"
-        << "arg6: print tensor value (0: no; 1: yes)\n"
+              << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
-        << "arg7: time kernel (0: no, 1: yes)\n"
+              << "arg6: print tensor value (0: no; 1: yes)\n"
-        << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
+              << "arg7: time kernel (0: no, 1: yes)\n"
-        << std::endl;
+              << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
+              << std::endl;
 }
 } // namespace
-int profile_conv_bwd_weight(int argc, char* argv[])
+int profile_grouped_conv_bwd_weight(int argc, char* argv[])
 {
    // 8 for control, 1 for num_dim_spatial
    if(argc < 9)
@@ -75,17 +76,17 @@ int profile_conv_bwd_weight(int argc, char* argv[])
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
-    using NWC   = ck::tensor_layout::convolution::NWC;
+    using GNWC   = ck::tensor_layout::convolution::GNWC;
-    using NHWC  = ck::tensor_layout::convolution::NHWC;
+    using GNHWC  = ck::tensor_layout::convolution::GNHWC;
-    using NDHWC = ck::tensor_layout::convolution::NDHWC;
+    using GNDHWC = ck::tensor_layout::convolution::GNDHWC;
-    using KXC   = ck::tensor_layout::convolution::KXC;
+    using GKXC   = ck::tensor_layout::convolution::GKXC;
-    using KYXC  = ck::tensor_layout::convolution::KYXC;
+    using GKYXC  = ck::tensor_layout::convolution::GKYXC;
-    using KZYXC = ck::tensor_layout::convolution::KZYXC;
+    using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
-    using NWK   = ck::tensor_layout::convolution::NWK;
+    using GNWK   = ck::tensor_layout::convolution::GNWK;
-    using NHWK  = ck::tensor_layout::convolution::NHWK;
+    using GNHWK  = ck::tensor_layout::convolution::GNHWK;
-    using NDHWK = ck::tensor_layout::convolution::NDHWK;
+    using GNDHWK = ck::tensor_layout::convolution::GNDHWK;
    constexpr auto I1 = ck::Number<1>{};
    constexpr auto I2 = ck::Number<2>{};
@@ -108,64 +109,64 @@ int profile_conv_bwd_weight(int argc, char* argv[])
        using WeiDataType = decltype(wei_type);
        using OutDataType = decltype(out_type);
-        bool pass = ck::profiler::profile_conv_bwd_weight_impl<NDimSpatial,
+        bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
-                                                               InLayout,
+                                                                       InLayout,
-                                                               WeiLayout,
+                                                                       WeiLayout,
-                                                               OutLayout,
+                                                                       OutLayout,
-                                                               InDataType,
+                                                                       InDataType,
-                                                               WeiDataType,
+                                                                       WeiDataType,
-                                                               OutDataType>(
+                                                                       OutDataType>(
            do_verification, init_method, do_log, time_kernel, params, split_k);
        return pass ? 0 : 1;
    };
-    if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, F32{}, BF16{});
+            return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{});
        }
    }
-    else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
+    else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
    {
        if(data_type == ConvDataType::F32_F32_F32)
        {
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{});
        }
        else if(data_type == ConvDataType::F16_F16_F16)
        {
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{});
        }
        else if(data_type == ConvDataType::BF16_F32_BF16)
        {
            // fp32 atomic add is used for weight tensor in bf16 kernel
-            return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, F32{}, BF16{});
+            return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{});
        }
    }

--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -8,14 +8,10 @@
 #include "profiler/include/profile_softmax_impl.hpp"
 using ck::index_t;
-using ck::profiler::NormDataType;
+using ck::profiler::SoftmaxDataType;
-using ck::profiler::NormType;
 struct ArgParser
 {
-    std::unordered_map<std::string, NormType> norm_dict = {{"batchnorm", NormType::BATCHNORM},
-                                                           {"softmax", NormType::SOFTMAX}};
    std::unordered_map<std::string, std::vector<int>> long_opts = {
        {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};
@@ -50,7 +46,7 @@ struct ArgParser
 void print_help()
 {
-    std::cout << "arg1: tensor operation (batchnorm/softmax)\n"
+    std::cout << "arg1: tensor operation (softmax)\n"
              << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
              << "arg3: verification (0: no; 1: yes)\n"
              << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
@@ -64,7 +60,7 @@ void print_help()
              << std::endl;
 }
-int profile_normalization(int argc, char* argv[])
+int profile_softmax(int argc, char* argv[])
 {
    if(argc <= 2)
    {
@@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[])
    ArgParser arg_parser;
    // short unnamed options
-    const NormType norm_type     = arg_parser.norm_dict[argv[1]];
+    const SoftmaxDataType data_type = static_cast<SoftmaxDataType>(std::stoi(argv[2]));
-    const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2]));
+    const bool do_verification      = std::stoi(argv[3]);
-    const bool do_verification   = std::stoi(argv[3]);
+    const int init_method           = std::stoi(argv[4]);
-    const int init_method        = std::stoi(argv[4]);
+    const bool do_log               = std::stoi(argv[5]);
-    const bool do_log            = std::stoi(argv[5]);
+    const bool time_kernel          = std::stoi(argv[6]);
-    const bool time_kernel       = std::stoi(argv[6]);
    // parse the long options
    arg_parser(argc, argv);
@@ -91,9 +86,10 @@ int profile_normalization(int argc, char* argv[])
        arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
    const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];
+    // Rank 3
    if(length.size() == 3)
    {
-        if(data_type == NormDataType::F16_F16)
+        if(data_type == SoftmaxDataType::F16_F16)
        {
            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
                                                                                 init_method,
@@ -103,10 +99,9 @@ int profile_normalization(int argc, char* argv[])
                                                                                 stride,
                                                                                 reduce,
                                                                                 float(alpha),
-                                                                                 float(beta),
+                                                                                 float(beta));
-                                                                                 norm_type);
        }
-        else if(data_type == NormDataType::F32_F32)
+        else if(data_type == SoftmaxDataType::F32_F32)
        {
            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
                                                                       init_method,
@@ -116,17 +111,17 @@ int profile_normalization(int argc, char* argv[])
                                                                       stride,
                                                                       reduce,
                                                                       float(alpha),
-                                                                       float(beta),
+                                                                       float(beta));
-                                                                       norm_type);
        }
        else
        {
            throw std::runtime_error("not implemented yet");
        }
    }
+    // Rank 4
    else if(length.size() == 4)
    {
-        if(data_type == NormDataType::F16_F16)
+        if(data_type == SoftmaxDataType::F16_F16)
        {
            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
                                                                                 init_method,
@@ -136,10 +131,9 @@ int profile_normalization(int argc, char* argv[])
                                                                                 stride,
                                                                                 reduce,
                                                                                 float(alpha),
-                                                                                 float(beta),
+                                                                                 float(beta));
-                                                                                 norm_type);
        }
-        else if(data_type == NormDataType::F32_F32)
+        else if(data_type == SoftmaxDataType::F32_F32)
        {
            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
                                                                       init_method,
@@ -149,8 +143,7 @@ int profile_normalization(int argc, char* argv[])
                                                                       stride,
                                                                       reduce,
                                                                       float(alpha),
-                                                                       float(beta),
+                                                                       float(beta));
-                                                                       norm_type);
        }
        else
        {

--- a/profiler/src/profiler.cpp
+++ b/profiler/src/profiler.cpp
@@ -18,12 +18,14 @@ int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
 int profile_conv_bwd_data(int, char*[]);
-int profile_conv_bwd_weight(int, char*[]);
 int profile_grouped_conv_fwd(int, char*[]);
-int profile_normalization(int, char*[]);
+int profile_grouped_conv_bwd_weight(int, char*[]);
+int profile_softmax(int, char*[]);
 int profile_layernorm(int, char*[]);
 int profile_groupnorm(int, char*[]);
 int profile_reduce(int, char*[]);
+int profile_batchnorm_forward(int, char*[]);
+int profile_batchnorm_backward(int, char*[]);
 static void print_helper_message()
 {
@@ -43,9 +45,11 @@ static void print_helper_message()
           "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
           "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
           "                        conv_bwd_data: Convolution Backward Data\n"
-           "                        conv_bwd_weight: Convolution Backward Weight\n"
           "                        grouped_conv_fwd: Grouped Convolution Forward\n"
-           "                        reduce: Reduce\n");
+           "                        grouped_conv_bwd_weight: Grouped Convolution Backward Weight\n"
+           "                        softmax: Softmax\n"
+           "                        reduce: Reduce\n"
+	   "                        bnorm_fwd: Batchnorm forward\n");
    // clang-format on
 }
@@ -117,21 +121,21 @@ int main(int argc, char* argv[])
    {
        return profile_conv_bwd_data(argc, argv);
    }
-    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
-    {
-        return profile_conv_bwd_weight(argc, argv);
-    }
    else if(strcmp(argv[1], "grouped_conv_fwd") == 0)
    {
        return profile_grouped_conv_fwd(argc, argv);
    }
+    else if(strcmp(argv[1], "conv_bwd_weight") == 0)
+    {
+        return profile_grouped_conv_bwd_weight(argc, argv);
+    }
    else if(strcmp(argv[1], "reduce") == 0)
    {
        return profile_reduce(argc, argv);
    }
-    else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0)
+    else if(strcmp(argv[1], "softmax") == 0)
    {
-        return profile_normalization(argc, argv);
+        return profile_softmax(argc, argv);
    }
    else if(strcmp(argv[1], "layernorm") == 0)
    {
@@ -141,6 +145,14 @@ int main(int argc, char* argv[])
    {
        return profile_groupnorm(argc, argv);
    }
+    else if(strcmp(argv[1], "bnorm_fwd") == 0)
+    {
+        return profile_batchnorm_forward(argc, argv);
+    }
+    else if(strcmp(argv[1], "bnorm_bwd") == 0)
+    {
+        return profile_batchnorm_backward(argc, argv);
+    }
    else
    {
        print_helper_message();

--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD"         \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=ON                                                                                   \
-D GPU_TARGETS=gfx908;gfx90a                                                                      \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                    \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -11,7 +11,7 @@ cmake
 -D CMAKE_CXX_FLAGS="-O3"                                                                          \
 -D CMAKE_BUILD_TYPE=Release                                                                       \
 -D BUILD_DEV=OFF                                                                                  \
-D GPU_TARGETS=gfx908;gfx90a                                                                      \
+-D GPU_TARGETS="gfx908;gfx90a"                                                                      \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
 ${MY_PROJECT_SOURCE}

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -26,7 +26,7 @@ function(add_gtest_executable TEST_NAME)
    # suppress gtest warnings
    target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef)
    target_link_libraries(${TEST_NAME} PRIVATE gtest_main)
-    gtest_discover_tests(${TEST_NAME})
+    add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}> )
    rocm_install(TARGETS ${TEST_NAME} COMPONENT tests)
 endfunction(add_gtest_executable TEST_NAME)
@@ -45,10 +45,12 @@ add_subdirectory(batched_gemm_softmax_gemm_permute)
 add_subdirectory(grouped_gemm)
 add_subdirectory(reduce)
 add_subdirectory(convnd_fwd)
-add_subdirectory(convnd_bwd_weight)
 add_subdirectory(convnd_bwd_data)
 add_subdirectory(grouped_convnd_fwd)
+add_subdirectory(grouped_convnd_bwd_weight)
 add_subdirectory(block_to_ctile_map)
 add_subdirectory(softmax)
 add_subdirectory(normalization)
 add_subdirectory(data_type)
+add_subdirectory(elementwise_normalization)
+add_subdirectory(batchnorm)
--- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt
 add_custom_target(test_batched_gemm_softmax_gemm_permute)
 add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp)
+add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp)
 target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
-add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
+target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance)
\ No newline at end of file
+add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16)
+add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16)
\ No newline at end of file
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "gtest/gtest.h"
+#include "test_batched_gemm_softmax_gemm_permute_util.hpp"
+template <typename Tuple>
+class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16 // typed-test fixture for the BF16 suite; adds no members of its own
+    : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple> // base fixture is declared in the included *_util.hpp header
+{
+};
+using I1_t = ck::Number<1>; // integer 1 carried as a type, used as a tuple element in KernelTypes
+using I2_t = ck::Number<2>; // integer 2 carried as a type, used as a tuple element in KernelTypes
+using MaskDisabled_t = // MaskingSpecialization::MaskDisabled wrapped as a type so it can be a tuple element
+    ck::integral_constant<MaskingSpecialization, MaskingSpecialization::MaskDisabled>;
+using MaskOutUpperTriangle_t = // MaskingSpecialization::MaskOutUpperTriangle wrapped as a type
+    ck::integral_constant<MaskingSpecialization, MaskingSpecialization::MaskOutUpperTriangle>;
+// clang-format off
+using KernelTypes = ::testing::Types< // type list for the typed suite: two tuples that differ only in the masking element (last position)
+    std::tuple<I2_t, I1_t, I1_t, I1_t, I1_t, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, MaskDisabled_t>,
+    std::tuple<I2_t, I1_t, I1_t, I1_t, I1_t, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, MaskOutUpperTriangle_t>
+    >;
+// clang-format on
+TYPED_TEST_SUITE(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, KernelTypes); // every TYPED_TEST of this fixture runs once per tuple in KernelTypes
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16) { this->Run(); } // runs the fixture without overriding lengths_, i.e. with whatever the base fixture provides
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadM)
+{ // single problem whose first length is 136 instead of 128; presumably that entry is M (per the test name) -- TODO confirm the ordering of lengths_ entries
+    this->lengths_ = std::vector<std::vector<int>>{
+        {136, 128, 32, 128, 2, 3},
+    };
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadN)
+{ // single problem whose second length is 136 instead of 128; presumably that entry is N (per the test name) -- TODO confirm
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 136, 32, 128, 3, 2},
+    };
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadK)
+{ // two problems whose third length (40, 136) is not a multiple of 32; presumably that entry is K (per the test name) -- TODO confirm
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 40, 128, 2, 4},
+        {128, 128, 136, 128, 4, 2},
+    };
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadO)
+{ // single problem whose fourth length is 136 instead of 128; presumably that entry is O (per the test name) -- TODO confirm
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 136, 1, 3},
+    };
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddM)
+{ // single problem with an odd first length (129); presumably M, per the test name -- TODO confirm
+    this->lengths_ = std::vector<std::vector<int>>{
+        {129, 128, 32, 128, 2, 3},
+    };
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddN)
+{ // single problem with an odd second length (129); presumably N, per the test name -- TODO confirm
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 129, 32, 128, 4, 3},
+    };
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddK)
+{ // two problems with an odd third length (33, 129); presumably K, per the test name -- TODO confirm
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 33, 128, 2, 3},
+        {128, 128, 129, 128, 2, 3},
+    };
+    this->Run();
+}
+// If kernel B1Layout is RowMajor, expect not to support odd O size
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddO)
+{ // single problem with an odd fourth length (129); how an unsupported size is handled is decided inside Run(), which is not visible here
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 129, 2, 3},
+    };
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16_IrregularK) // DISABLED_ prefix: GoogleTest skips this unless --gtest_also_run_disabled_tests is given
+{
+    this->lengths_ = std::vector<std::vector<int>>{{256, 256, 160, 160, 1, 16}, // third length is 160, 80 or 40 across these cases; presumably {M, N, K, O, G0, G1} -- TODO confirm
+                                                   {256, 64, 160, 64, 1, 16},
+                                                   {1024, 1024, 80, 80, 1, 16},
+                                                   {1024, 64, 80, 64, 1, 16},
+                                                   {4096, 4096, 40, 40, 1, 16},
+                                                   {4096, 64, 40, 64, 1, 16}};
+    this->bench_   = true; // fixture flag forwarded to the profiler call; presumably enables kernel timing
+    this->verify_  = false; // fixture flag forwarded to the profiler call; presumably skips result verification
+    this->Run();
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16) // DISABLED_ prefix: GoogleTest skips this unless --gtest_also_run_disabled_tests is given
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {256, 256, 64, 64, 48, 16}, // first two lengths sweep 256..4096, last four alternate 64/64 and 128/128 with 48, 16 fixed; presumably {M, N, K, O, G0, G1} -- TODO confirm
+        {256, 256, 128, 128, 48, 16},
+        {512, 512, 64, 64, 48, 16},
+        {512, 512, 128, 128, 48, 16},
+        {1024, 1024, 64, 64, 48, 16},
+        {1024, 1024, 128, 128, 48, 16},
+        {2048, 2048, 64, 64, 48, 16},
+        {2048, 2048, 128, 128, 48, 16},
+        {4096, 4096, 64, 64, 48, 16},
+        {4096, 4096, 128, 128, 48, 16},
+    };
+    this->bench_  = true; // fixture flag forwarded to the profiler call; presumably enables kernel timing
+    this->verify_ = false; // fixture flag forwarded to the profiler call; presumably skips result verification
+    this->Run();
+}
+using ck::tensor_operation::device::GemmSpecialization;
+TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMatch) // each specialization must accept a problem where exactly the dimensions named in it are set to P
+{
+    int P = 120; // not a multiple of 128 or of 32 (the per-block sizes in the wrapper's name), so a dimension of this size requires padding
+    int Q = 128; // a multiple of both 128 and 32, so a dimension of this size does not require padding
+    // Argument order below: IsSupported(M, N, K, O)
+    // clang-format off
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(Q, Q, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MPadding>{}.IsSupported(P, Q, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NPadding>{}.IsSupported(Q, P, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::KPadding>{}.IsSupported(Q, Q, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNPadding>{}.IsSupported(P, P, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MKPadding>{}.IsSupported(P, Q, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NKPadding>{}.IsSupported(Q, P, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(P, P, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::OPadding>{}.IsSupported(Q, Q, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MOPadding>{}.IsSupported(P, Q, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NOPadding>{}.IsSupported(Q, P, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::KOPadding>{}.IsSupported(Q, Q, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNOPadding>{}.IsSupported(P, P, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MKOPadding>{}.IsSupported(P, Q, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::NKOPadding>{}.IsSupported(Q, P, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(P, P, P, P));
+    // clang-format on
+}
+TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMismatch) // problem sizes that the chosen specialization is expected to reject
+{
+    // Argument order below: IsSupported(M, N, K, O)
+    // clang-format off
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(128, 128, 120, 128)); // K = 120 is not a multiple of 32 and Default names no padded dimension
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(128, 128, 128, 120)); // O = 120 is not a multiple of 128 and MNKPadding does not name O
+    // Even with every dimension padded, the kernel rejects a K that violates the source-vector constraint: SrcVectorDim == KDim, so SizeKRaw % ABSrcScalarPerVector == 0 must hold. Both an odd K (129) and an even K (130) are expected to fail.
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 129, 128));
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 130, 128));
+    // Likewise the kernel rejects an odd O size: SrcVectorDim == ODim, so SizeORaw % B1SrcScalarPerVector == 0 must hold.
+    EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 128, 129));
+    // clang-format on
+}
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest) // assorted sizes, several of which (49, 1020, 576) are not multiples of 128
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {49, 49, 64, 64, 4, 6}, // presumably {M, N, K, O, G0, G1} -- TODO confirm against the fixture's Run()
+        {64, 49, 64, 64, 4, 6},
+        {1020, 1020, 64, 128, 4, 6},
+        {576, 576, 64, 64, 4, 6},
+    };
+    this->Run();
+}
--- a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp
@@ -16,7 +16,8 @@ using ck::tensor_operation::device::TensorSpecialization;
 template <ck::index_t N>
 using I = ck::Number<N>;
-using F16 = ck::half_t;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;
@@ -63,7 +64,7 @@ struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
                                                                         ck::Tuple<>,
                                                                         ck::Tuple<>,
                                                                         MaskingType::value>(
-                verify_, 1, false, bench_, M, N, K, O, G0, G1);
+                verify_, 2, false, bench_, M, N, K, O, G0, G1);
        EXPECT_TRUE(pass);
    }
@@ -224,3 +225,144 @@ struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128
        return gemm.IsSupportedArgument(argument);
    }
 };
+// Test-only wrapper around one concrete BF16 instantiation of
+// DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle (float accumulation, Scale applied to the
+// first gemm's accumulator, upper-triangle masking). GemmSpec is the only free parameter, so a
+// test can ask, via IsSupported(), whether a given padding specialization accepts a problem size.
+template <GemmSpecialization GemmSpec>
+struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using Scale       = ck::tensor_operation::element_wise::Scale;
+    template <ck::index_t... Is>
+    using S = ck::Sequence<Is...>;
+    using ADataType        = BF16;
+    using B0DataType       = BF16;
+    using B1DataType       = BF16;
+    using AccDataType      = float;
+    using CShuffleDataType = BF16;
+    using CDataType        = BF16;
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = Scale;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+    // The five leading integers presumably are NumDimG/M/N/K/O, matching the "G2M1N1K1O1" in the
+    // struct name -- TODO confirm against the device op's template parameter list.
+    using DeviceGemmGemmInstance =
+        ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+            2,
+            1,
+            1,
+            1,
+            1,
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            ck::Tuple<>,
+            ck::Tuple<>,
+            AccDataType,
+            CShuffleDataType,
+            AElementOp,
+            B0ElementOp,
+            Acc0ElementOp,
+            B1ElementOp,
+            CElementOp,
+            GemmSpec,
+            TensorSpecialization::Default, // ATensorSpec
+            TensorSpecialization::Default, // B0TensorSpec
+            TensorSpecialization::Default, // B1TensorSpec
+            TensorSpecialization::Default, // CTensorSpec
+            1,
+            256,
+            128,         // MPerBlock
+            128,         // NPerBlock
+            32,          // KPerBlock
+            128,         // Gemm1NPerBlock
+            32,          // Gemm1KPerBlock
+            8,           // AK1
+            8,           // BK1
+            2,           // B1K1
+            32,          // MPerXDL
+            32,          // NPerXDL
+            1,           // MXdlPerWave
+            4,           // NXdlPerWave
+            4,           // Gemm1NXdlPerWave
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            8,
+            8,
+            true,
+            S<4, 64, 1>, // BBlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            8,
+            8,
+            true,
+            S<8, 32, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            4,
+            2,
+            false,
+            1,              // CShuffleMXdlPerWavePerShuffle
+            2,              // CShuffleNXdlPerWavePerShuffle
+            S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+            MaskingSpecialization::MaskOutUpperTriangle>; // MaskOutUpperTriangle
+    // Returns whether this instance accepts a problem of raw size M x N x K x O.
+    // The argument is built with G0 = G1 = 1 and null data pointers; it is only handed to
+    // IsSupportedArgument(), nothing is launched from here.
+    bool IsSupported(int M, int N, int K, int O)
+    {
+        const int G0 = 1, G1 = 1;
+        // In each pair below the lengths are listed in [G0, G1, X, Y] order, while the strides
+        // describe the memory ordering named in the comment.
+        // A memory layout [G0, M, G1, K]
+        std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+        std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
+        // B0 memory layout [G0, N, G1, K]
+        std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+        std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
+        // B1 memory layout [G0, N, G1, O] (O has stride 1 although it is listed before N)
+        std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+        std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
+        // C memory layout [G0, M, G1, O]
+        std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+        std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
+        auto gemm     = DeviceGemmGemmInstance{};
+        auto argument = gemm.MakeArgument(static_cast<ADataType*>(nullptr),
+                                          static_cast<B0DataType*>(nullptr),
+                                          static_cast<B1DataType*>(nullptr),
+                                          static_cast<CDataType*>(nullptr),
+                                          {}, // p_acc0_biases
+                                          {}, // p_acc1_biases
+                                          a_gs_ms_ks_lengths,
+                                          a_gs_ms_ks_strides,
+                                          b0_gs_ns_ks_lengths,
+                                          b0_gs_ns_ks_strides,
+                                          b1_gs_os_ns_lengths,
+                                          b1_gs_os_ns_strides,
+                                          c_gs_ms_os_lengths,
+                                          c_gs_ms_os_strides,
+                                          {},             // acc0_biases_gs_ms_ns_lengths
+                                          {},             // acc0_biases_gs_ms_ns_strides
+                                          {},             // acc1_biases_gs_ms_os_lengths
+                                          {},             // acc1_biases_gs_ms_os_strides
+                                          PassThrough{},  // a_element_op
+                                          PassThrough{},  // b0_element_op
+                                          Scale{1.f},     // acc0_element_op
+                                          PassThrough{},  // b1_element_op
+                                          PassThrough{}); // c_element_op
+        return gemm.IsSupportedArgument(argument);
+    }
+};
--- a/test/batchnorm/CMakeLists.txt
+++ b/test/batchnorm/CMakeLists.txt
+add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp) # forward batchnorm rank-4 test, built from batchnorm_fwd_rank_4.cpp
+add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp) # backward batchnorm rank-4 test, built from batchnorm_bwd_rank_4.cpp
+target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance) # both tests link the same two libraries
+target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance)
--- a/test/batchnorm/batchnorm_bwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <tuple>
+#include <gtest/gtest.h>
+#include "profiler/include/profile_batchnorm_backward_impl.hpp"
+using F16  = ck::half_t;
+using F32  = float;
+using BF16 = ck::bhalf_t;
+using F64  = double;
+// Typed fixture for rank-4 batchnorm backward. Tuple supplies, in order, the X, Dx, Dy, Acc,
+// Scale, Bias and MeanVar data types. A test sets reduceDims and then calls Run<N>(), which
+// profiles every shape in list_of_lengths.
+template <typename Tuple>
+class TestBatchNormBwdRank4 : public ::testing::Test
+{
+    private:
+    const double epsilon = std::numeric_limits<float>::epsilon();
+    protected:
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using DxDataType      = std::tuple_element_t<1, Tuple>;
+    using DyDataType      = std::tuple_element_t<2, Tuple>;
+    using AccDataType     = std::tuple_element_t<3, Tuple>;
+    using ScaleDataType   = std::tuple_element_t<4, Tuple>;
+    using BiasDataType    = std::tuple_element_t<5, Tuple>;
+    using MeanVarDataType = std::tuple_element_t<6, Tuple>;
+    // Rank-4 tensor shapes exercised by every test of this fixture.
+    std::vector<std::vector<size_t>> list_of_lengths = {
+        {128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
+    // Indices of the dimensions to reduce over; must hold exactly NumReduceDim entries.
+    std::vector<int> reduceDims;
+    // Runs profile_batchnorm_backward_impl for every shape, once with the trailing boolean
+    // option set to true and once with it set to false (presumably haveSavedMeanInvVar --
+    // TODO confirm against the profiler header). Each run is asserted on its own, so a failure
+    // of the first configuration no longer prevents the second one from being executed.
+    template <int NumReduceDim>
+    void Run()
+    {
+        // Loop-invariant precondition: checked once, and EXPECT_EQ prints both values on failure.
+        EXPECT_EQ(reduceDims.size(), static_cast<size_t>(NumReduceDim));
+        for(auto& inOutLengths : list_of_lengths)
+        {
+            for(const bool flag : {true, false})
+            {
+                const bool pass =
+                    ck::profiler::profile_batchnorm_backward_impl<XDataType,
+                                                                  DxDataType,
+                                                                  DyDataType,
+                                                                  AccDataType,
+                                                                  ScaleDataType,
+                                                                  BiasDataType,
+                                                                  MeanVarDataType,
+                                                                  4,
+                                                                  NumReduceDim>(
+                        true, 3, false, false, inOutLengths, reduceDims, flag, epsilon);
+                EXPECT_TRUE(pass) << "flag=" << flag;
+            }
+        }
+    }
+};
+using KernelTypes = ::testing::Types<std::tuple<F16, F32, F32, F32, F16, F32, F32>, // tuple order: X, Dx, Dy, Acc, Scale, Bias, MeanVar (see the fixture's tuple_element_t aliases)
+                                     std::tuple<F32, F32, F32, F32, F32, F32, F32>,
+                                     std::tuple<BF16, F32, F32, F32, BF16, F32, F32>,
+                                     std::tuple<F64, F64, F64, F64, F64, F64, F64>>;
+TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes); // instantiate the fixture once per tuple above
+// nhwc: reduce over dimensions 0, 1 and 2 of the rank-4 tensor, leaving dimension 3
+TYPED_TEST(TestBatchNormBwdRank4, nhwc)
+{
+    this->reduceDims = {0, 1, 2};
+    this->template Run<3>(); // 3 must equal reduceDims.size()
+}
+// nchw: reduce over dimensions 0, 2 and 3 of the rank-4 tensor, leaving dimension 1
+TYPED_TEST(TestBatchNormBwdRank4, nchw)
+{
+    this->reduceDims = {0, 2, 3};
+    this->template Run<3>(); // 3 must equal reduceDims.size()
+}
--- a/test/batchnorm/batchnorm_fwd_rank_4.cpp
+++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <vector>
+#include <tuple>
+#include <gtest/gtest.h>
+#include "profiler/include/profile_batchnorm_forward_impl.hpp"
+using F16  = ck::half_t;
+using F32  = float;
+using BF16 = ck::bhalf_t;
+using I8   = int8_t;
+using F64  = double;
+// Typed fixture for rank-4 batchnorm forward. Tuple supplies, in order, the X, Y, Acc, Scale,
+// Bias and MeanVar data types. A test sets reduceDims and then calls Run<N>(), which profiles
+// every shape in list_of_lengths.
+template <typename Tuple>
+class TestBatchNormFwdRank4 : public ::testing::Test
+{
+    private:
+    const double epsilon       = std::numeric_limits<float>::epsilon();
+    const double averageFactor = 0.1;
+    protected:
+    using XDataType       = std::tuple_element_t<0, Tuple>;
+    using YDataType       = std::tuple_element_t<1, Tuple>;
+    using AccDataType     = std::tuple_element_t<2, Tuple>;
+    using ScaleDataType   = std::tuple_element_t<3, Tuple>;
+    using BiasDataType    = std::tuple_element_t<4, Tuple>;
+    using MeanVarDataType = std::tuple_element_t<5, Tuple>;
+    // Rank-4 tensor shapes exercised by every test of this fixture.
+    std::vector<std::vector<size_t>> list_of_lengths = {
+        {128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}};
+    // Indices of the dimensions to reduce over; must hold exactly NumReduceDim entries.
+    std::vector<int> reduceDims;
+    // Runs profile_batchnorm_forward_impl for every shape, once with both boolean options set
+    // to true and once with both set to false (presumably updateMovingAverage and
+    // saveMeanAndInvVariance -- TODO confirm against the profiler header). Each run is asserted
+    // on its own, so a failure of the first configuration no longer prevents the second one
+    // from being executed.
+    template <int NumReduceDim>
+    void Run()
+    {
+        // Loop-invariant precondition: checked once, and EXPECT_EQ prints both values on failure.
+        EXPECT_EQ(reduceDims.size(), static_cast<size_t>(NumReduceDim));
+        for(auto& inOutLengths : list_of_lengths)
+        {
+            for(const bool flag : {true, false})
+            {
+                const bool pass =
+                    ck::profiler::profile_batchnorm_forward_impl<XDataType,
+                                                                 YDataType,
+                                                                 AccDataType,
+                                                                 ScaleDataType,
+                                                                 BiasDataType,
+                                                                 MeanVarDataType,
+                                                                 4,
+                                                                 NumReduceDim>(true,
+                                                                               3,
+                                                                               false,
+                                                                               false,
+                                                                               inOutLengths,
+                                                                               reduceDims,
+                                                                               flag,
+                                                                               flag,
+                                                                               epsilon,
+                                                                               averageFactor);
+                EXPECT_TRUE(pass) << "flag=" << flag;
+            }
+        }
+    }
+};
+using KernelTypes = ::testing::Types<std::tuple<F16, F16, F32, F16, F16, F32>, // tuple order: X, Y, Acc, Scale, Bias, MeanVar (see the fixture's tuple_element_t aliases)
+                                     std::tuple<F32, F32, F32, F32, F32, F32>,
+                                     std::tuple<BF16, BF16, F32, BF16, BF16, F32>,
+                                     std::tuple<F64, F64, F64, F64, F64, F64>>;
+TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes); // instantiate the fixture once per tuple above
+// nhwc: reduce over dimensions 0, 1 and 2 of the rank-4 tensor, leaving dimension 3
+TYPED_TEST(TestBatchNormFwdRank4, nhwc)
+{
+    this->reduceDims = {0, 1, 2};
+    this->template Run<3>(); // 3 must equal reduceDims.size()
+}
+// nchw: reduce over dimensions 0, 2 and 3 of the rank-4 tensor, leaving dimension 1
+TYPED_TEST(TestBatchNormFwdRank4, nchw)
+{
+    this->reduceDims = {0, 2, 3};
+    this->template Run<3>(); // 3 must equal reduceDims.size()
+}
--- a/test/convnd_bwd_weight/CMakeLists.txt
+++ b/test/convnd_bwd_weight/CMakeLists.txt
-add_gtest_executable(test_convnd_bwd_weight convnd_bwd_weight.cpp) 
-target_link_libraries(test_convnd_bwd_weight PRIVATE utility device_conv1d_bwd_weight_instance device_conv2d_bwd_weight_instance  device_conv3d_bwd_weight_instance)
--- a/test/elementwise_normalization/CMakeLists.txt
+++ b/test/elementwise_normalization/CMakeLists.txt
+add_custom_target(test_elementwise_normalization) # umbrella target for the tests in this directory
+add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) # FP16 elementwise layernorm test, built from test_elementwise_layernorm_fp16.cpp
+target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
+add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) # building the umbrella target builds the test
--- a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "gtest/gtest.h"
+#include "profiler/include/profile_elementwise_layernorm_impl.hpp"
+using F16 = ck::half_t;
+using F32 = float;
+using ck::index_t;
+// Typed fixture for the elementwise + layernorm profiler. Tuple supplies, in order, the A, B,
+// Gamma, Beta, Acc and Y data types.
+template <typename Tuple>
+class TestElementwiseLayernorm : public ::testing::Test
+{
+    protected:
+    using ADataType     = std::tuple_element_t<0, Tuple>;
+    using BDataType     = std::tuple_element_t<1, Tuple>;
+    using GammaDataType = std::tuple_element_t<2, Tuple>;
+    using BetaDataType  = std::tuple_element_t<3, Tuple>;
+    using AccDataType   = std::tuple_element_t<4, Tuple>;
+    using YDataType     = std::tuple_element_t<5, Tuple>;
+    // Calls profile_elementwise_layernorm_impl once per {M, N} shape below and expects every
+    // call to report success.
+    void Run()
+    {
+        // M, N
+        std::vector<std::vector<ck::index_t>> lengths = {
+            {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}};
+        // Bind by const reference so the range-for does not copy each inner vector.
+        for(const auto& length : lengths)
+        {
+            bool success = ck::profiler::profile_elementwise_layernorm_impl<ADataType,
+                                                                            BDataType,
+                                                                            GammaDataType,
+                                                                            BetaDataType,
+                                                                            AccDataType,
+                                                                            YDataType>(
+                true, 2, false, false, length);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+using KernelTypes = ::testing::Types<
+    // Tuple order: ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType
+    std::tuple<F16, F16, F16, F16, F32, F16>>;
+TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes); // instantiate the fixture once per tuple above
+TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); } // runs every shape listed inside Run()
--- a/test/gemm/gemm_util.hpp
+++ b/test/gemm/gemm_util.hpp
@@ -9,6 +9,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 namespace ck {
@@ -128,15 +129,15 @@ struct TestGemm
    {
        auto f_host_tensor_descriptor =
            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                using namespace ck::literals;
                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                    return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                                std::vector<std::size_t>({stride, 1}));
                }
                else
                {
-                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                    return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                                std::vector<std::size_t>({1, stride}));
                }
            };
@@ -229,27 +230,27 @@ struct TestGemm
            bool res = false;
            if(std::is_same<CDataType, float>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, ck::half_t>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, ck::bhalf_t>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, int8_t>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }
            else if(std::is_same<CDataType, double>::value)
            {
-                res = ck::utils::check_err(c_device.mData, c_host.mData);
+                res = ck::utils::check_err(c_device, c_host);
                std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
            }

--- a/test/gemm_split_k/gemm_split_k.cpp
+++ b/test/gemm_split_k/gemm_split_k.cpp
@@ -16,6 +16,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/utility/host_gemm.hpp"
@@ -93,15 +94,15 @@ int test_gemm(const gemmArgs& args)
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, bool row_major) {
+            using namespace ck::literals;
            if(row_major)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                            std::vector<std::size_t>({1, stride}));
            }
        };
@@ -225,9 +226,8 @@ int main(int argc, char* argv[])
    std::vector<gemmArgs> test_cases;
    if(argc == 1)
    {
-        test_cases = {{GemmMatrixLayout::MK_KN_MN, 3, 3, 3, 3, 3, 3, 1}};
+        test_cases = {{GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 2},
-        // JD: Populate with more and meaningful
+                      {GemmMatrixLayout::MK_KN_MN, 1024, 1024, 1024, 1024, 1024, 1024, 8}};
-        return 0;
    }
    else if(argc == 9)
    {
@@ -252,11 +252,10 @@ int main(int argc, char* argv[])
        printf("arg2 to 7: M, N, K, StrideA, StrideB, StrideC KBatch\n");
        return -1;
    }
+    bool error = false;
    for(const auto& kinder : test_cases)
    {
-        const auto res = test_gemm(kinder);
+        error |= test_gemm(kinder);
-        if(!res)
-            return -1;
    }
-    return 0;
+    return error ? 1 : 0;
 }
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
+add_gtest_executable(test_grouped_convnd_bwd_weight grouped_convnd_bwd_weight.cpp) # grouped conv backward-weight test, built from grouped_convnd_bwd_weight.cpp
+target_link_libraries(test_grouped_convnd_bwd_weight PRIVATE utility device_grouped_conv1d_bwd_weight_instance device_grouped_conv2d_bwd_weight_instance device_grouped_conv3d_bwd_weight_instance) # links the 1D, 2D and 3D grouped instance libraries
--- a/test/convnd_bwd_weight/convnd_bwd_weight.cpp
+++ b/test/convnd_bwd_weight/convnd_bwd_weight.cpp
@@ -4,14 +4,15 @@
 #include <cstdlib>
 #include <iostream>
 #include <initializer_list>
-#include <vector>
 #include <tuple>
+#include <vector>
 #include <gtest/gtest.h>
-#include "profiler/include/profile_conv_bwd_weight_impl.hpp"
+#include "profiler/include/profile_grouped_conv_bwd_weight_impl.hpp"
 template <typename Tuple>
-class TestConvndBwdWeight : public ::testing::Test
+class TestGroupedConvndBwdWeight : public ::testing::Test
 {
    protected:
    using DataType = std::tuple_element_t<0, Tuple>;
@@ -25,20 +26,20 @@ class TestConvndBwdWeight : public ::testing::Test
        {
            bool pass;
            EXPECT_FALSE(conv_params.empty());
-            pass = ck::profiler::profile_conv_bwd_weight_impl<
+            pass = ck::profiler::profile_grouped_conv_bwd_weight_impl<
                NDimSpatial,
                ck::tuple_element_t<NDimSpatial - 1,
-                                    ck::Tuple<ck::tensor_layout::convolution::NWC,
+                                    ck::Tuple<ck::tensor_layout::convolution::GNWC,
-                                              ck::tensor_layout::convolution::NHWC,
+                                              ck::tensor_layout::convolution::GNHWC,
-                                              ck::tensor_layout::convolution::NDHWC>>,
+                                              ck::tensor_layout::convolution::GNDHWC>>,
                ck::tuple_element_t<NDimSpatial - 1,
-                                    ck::Tuple<ck::tensor_layout::convolution::KXC,
+                                    ck::Tuple<ck::tensor_layout::convolution::GKXC,
-                                              ck::tensor_layout::convolution::KYXC,
+                                              ck::tensor_layout::convolution::GKYXC,
-                                              ck::tensor_layout::convolution::KZYXC>>,
+                                              ck::tensor_layout::convolution::GKZYXC>>,
                ck::tuple_element_t<NDimSpatial - 1,
-                                    ck::Tuple<ck::tensor_layout::convolution::NWK,
+                                    ck::Tuple<ck::tensor_layout::convolution::GNWK,
-                                              ck::tensor_layout::convolution::NHWK,
+                                              ck::tensor_layout::convolution::GNHWK,
-                                              ck::tensor_layout::convolution::NDHWK>>,
+                                              ck::tensor_layout::convolution::GNDHWK>>,
                DataType,
                DataType,
                DataType>(true,  // do_verification
@@ -54,37 +55,37 @@ class TestConvndBwdWeight : public ::testing::Test
 using KernelTypes =
    ::testing::Types<std::tuple<float>, std::tuple<ck::half_t>, std::tuple<ck::bhalf_t>>;
-TYPED_TEST_SUITE(TestConvndBwdWeight, KernelTypes);
+TYPED_TEST_SUITE(TestGroupedConvndBwdWeight, KernelTypes);
-TYPED_TEST(TestConvndBwdWeight, Test1D)
+TYPED_TEST(TestGroupedConvndBwdWeight, Test1D)
 {
    this->conv_params.clear();
-    this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}});
-    this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 4, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}});
-    this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 4, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}});
    this->template Run<1>();
 }
-TYPED_TEST(TestConvndBwdWeight, Test2D)
+TYPED_TEST(TestGroupedConvndBwdWeight, Test2D)
 {
    this->conv_params.clear();
    this->conv_params.push_back(
-        {2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 4, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}});
    this->conv_params.push_back(
-        {2, 1, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+        {2, 4, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
    this->conv_params.push_back(
-        {2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+        {2, 4, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
    this->template Run<2>();
 }
-TYPED_TEST(TestConvndBwdWeight, Test3D)
+TYPED_TEST(TestGroupedConvndBwdWeight, Test3D)
 {
    this->conv_params.clear();
    this->conv_params.push_back(
-        {3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 4, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    this->conv_params.push_back(
-        {3, 1, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+        {3, 4, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
    this->conv_params.push_back(
-        {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+        {3, 4, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
    this->template Run<3>();
 }