Merge branch 'develop' into wavelet_model

7e493730 · Adam Osewski · b89a88b5 · 40942b90 · 7e493730 · 7e493730
Commit 7e493730 authored Oct 13, 2022 by Adam Osewski
14 changed files
--- a/script/run_full_performance_tests.sh
+++ b/script/run_full_performance_tests.sh
@@ -5,12 +5,11 @@
 # post your new test results to the database and compare them to the baseline
 # please contact Illia.Silin@amd.com for more details
 #
-# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
+# run the script as "./run_full_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>"
 # input arguments: 
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verify correctness on CPU (may take a long time)
 # environment tag  : a string describing the specifics of your test environment
-# gpu_arch         : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
 # branch name      : name of the branch in git repo (git status | grep -e 'On branch')
 # node name        : $hostname

@@ -19,11 +18,9 @@ export verify=$1
 echo 'Verification: ' $verify
 export env_type=$2
 echo 'Environment type: ' $env_type
-export gpu_arch=$3
-echo 'GPU architecture: ' $gpu_arch
-export branch=$4
+export branch=$3
 echo 'Branch name: ' $branch
-export host_name=$5
+export host_name=$4
 echo 'Host name: ' $host_name
 function print_log_header(){
 	rm -f $1;
@@ -38,7 +35,7 @@ function print_log_header(){
 }

 #run gemm tests
-export gemm_log="perf_gemm_${gpu_arch}.log"
+export gemm_log="perf_gemm.log"
 print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
 ./profile_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
@@ -58,7 +55,7 @@ print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 3 3 $verify 1 0 1 2>&1 | tee -a $gemm_log

 #run batched_gemm tests
-export batched_gemm_log="perf_batched_gemm_${gpu_arch}.log"
+export batched_gemm_log="perf_batched_gemm.log"
 print_log_header $batched_gemm_log $env_type $branch $host_name
 ./profile_batched_gemm.sh batched_gemm 0 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
 ./profile_batched_gemm.sh batched_gemm 0 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
@@ -78,7 +75,7 @@ print_log_header $batched_gemm_log $env_type $branch $host_name
 ./profile_batched_gemm.sh batched_gemm 3 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log

 #run grouped_gemm tests
-export grouped_gemm_log="perf_grouped_gemm_${gpu_arch}.log"
+export grouped_gemm_log="perf_grouped_gemm.log"
 print_log_header $grouped_gemm_log $env_type $branch $host_name
 ./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log
 ./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log
@@ -86,7 +83,7 @@ print_log_header $grouped_gemm_log $env_type $branch $host_name
 ./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log

 #run GEMM+Bilinear tests
-export gemm_bilinear_log="perf_gemm_bilinear_${gpu_arch}.log"
+export gemm_bilinear_log="perf_gemm_bilinear.log"
 print_log_header $gemm_bilinear_log $env_type $branch $host_name
 ./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
 ./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log
@@ -94,7 +91,7 @@ print_log_header $gemm_bilinear_log $env_type $branch $host_name
 ./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log

 #run conv_fwd tests
-export conv_fwd_log="perf_conv_fwd_${gpu_arch}.log"
+export conv_fwd_log="perf_conv_fwd.log"
 print_log_header $conv_fwd_log $env_type $branch $host_name
 ./profile_conv_fwd.sh conv_fwd 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
 ./profile_conv_fwd.sh conv_fwd 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log
@@ -102,7 +99,7 @@ print_log_header $conv_fwd_log $env_type $branch $host_name
 ./profile_conv_fwd.sh conv_fwd 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log

 #run conv_bwd_data tests
-export conv_bwd_data_log="perf_conv_bwd_data_${gpu_arch}.log"
+export conv_bwd_data_log="perf_conv_bwd_data.log"
 print_log_header $conv_bwd_data_log $env_type $branch $host_name
 ./profile_conv_bwd_data.sh conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
 ./profile_conv_bwd_data.sh conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log
@@ -110,33 +107,43 @@ print_log_header $conv_bwd_data_log $env_type $branch $host_name
 ./profile_conv_bwd_data.sh conv_bwd_data 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log

 #run resnet50 tests
-export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
+export resnet256_log="perf_resnet50_N256.log"
 print_log_header $resnet256_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 2>&1 | tee -a $resnet256_log
-export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
+export resnet4_log="perf_resnet50_N4.log"
 print_log_header $resnet4_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 2>&1 | tee -a $resnet4_log

 #run reduction tests
-export reduction_log="perf_reduction_${gpu_arch}.log"
+export reduction_log="perf_reduction.log"
 print_log_header $reduction_log $env_type $branch $host_name
 ./profile_reduce_with_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log
 ./profile_reduce_no_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log

-#run splitK_gemm tests
-export splitK_gemm_log="perf_splitK_gemm_${gpu_arch}.log"
+#run splitK_gemm tests, first correctness verification, then performance
+export splitK_gemm_ver_log="perf_splitK_gemm_verify.log"
+print_log_header $splitK_gemm_ver_log $env_type $branch $host_name
+./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log
+export splitK_gemm_log="perf_splitK_gemm.log"
 print_log_header $splitK_gemm_log $env_type $branch $host_name
-./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 0 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
-./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 0 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log
+./profile_splitK_gemm.sh gemm_splitk 1 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log

 #run ONNX gemm tests
-export onnx_log="perf_onnx_gemm_${gpu_arch}.log"
+export onnx_log="perf_onnx_gemm.log"
 print_log_header $onnx_log $env_type $branch $host_name
 ./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
 ./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log
--- a/script/run_performance_tests.sh
+++ b/script/run_performance_tests.sh
 #!/bin/bash 
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
-# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <gpu_arch> <branch name> < node name>
+# run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>"
 # input arguments: 
 # verification = 0 : do not verify result correctness on CPU
 #              = 1 : verify correctness on CPU (may take a long time)
 # environment tag  : a string describing the specifics of your test environment
-# gpu_arch         : a string for GPU architecture, e.g. "gfx908" or "gfx90a".
 # branch name      : name of the branch in git repo (git status | grep -e 'On branch')
 # node name        : $hostname

@@ -15,11 +14,9 @@ export verify=$1
 echo 'Verification: ' $verify
 export env_type=$2
 echo 'Environment type: ' $env_type
-export gpu_arch=$3
-echo 'GPU architecture: ' $gpu_arch
-export branch=$4
+export branch=$3
 echo 'Branch name: ' $branch
-export host_name=$5
+export host_name=$4
 echo 'Host name: ' $host_name

 function print_log_header(){
@@ -35,7 +32,7 @@ function print_log_header(){
 }

 #run gemm tests
-export gemm_log="perf_gemm_${gpu_arch}.log"
+export gemm_log="perf_gemm.log"
 print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 0 0 $verify 1 0 1 | tee -a $gemm_log
 ./profile_gemm.sh gemm 1 0 $verify 1 0 1 | tee -a $gemm_log
@@ -55,9 +52,9 @@ print_log_header $gemm_log $env_type $branch $host_name
 ./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log

 #run resnet50 tests
-export resnet256_log="perf_resnet50_N256_${gpu_arch}.log"
+export resnet256_log="perf_resnet50_N256.log"
 print_log_header $resnet256_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 | tee -a $resnet256_log
-export resnet4_log="perf_resnet50_N4_${gpu_arch}.log"
+export resnet4_log="perf_resnet50_N4.log"
 print_log_header $resnet4_log $env_type $branch $host_name
 ./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 | tee -a $resnet4_log
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -42,6 +42,7 @@ add_subdirectory(batched_gemm)
 add_subdirectory(batched_gemm_reduce)
 add_subdirectory(batched_gemm_gemm)
 add_subdirectory(batched_gemm_softmax_gemm)
+add_subdirectory(batched_gemm_masking_scale_softmax_gemm_permute)
 add_subdirectory(grouped_gemm)
 add_subdirectory(reduce)
 add_subdirectory(convnd_fwd)

--- a/test/batched_gemm_masking_scale_softmax_gemm_permute/CMakeLists.txt
+++ b/test/batched_gemm_masking_scale_softmax_gemm_permute/CMakeLists.txt
+add_custom_target(test_batched_gemm_masking_scale_softmax_gemm_permute)
+
+add_gtest_executable(test_batched_gemm_masking_scale_softmax_gemm_permute_fp16 test_batched_gemm_masking_scale_softmax_gemm_permute_fp16.cpp)
+target_link_libraries(test_batched_gemm_masking_scale_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_masking_scale_softmax_gemm_permute_instance)
+add_dependencies(test_batched_gemm_masking_scale_softmax_gemm_permute test_batched_gemm_masking_scale_softmax_gemm_permute_fp16)
\ No newline at end of file
--- a/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_fp16.cpp
+++ b/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "test_batched_gemm_masking_scale_softmax_gemm_permute_util.hpp"
+
+template <typename Tuple>
+class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16
+    : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute<Tuple>
+{
+};
+
+// clang-format off
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using CPermuteNumDims_G_M_O =
+    S<2, 1, 1>; // "using CLayout = Row" has been replaced by CPermuteNumDims_G_M_O
+using KernelTypes = ::testing::Types<
+    std::tuple<F16, F16, F16, F16, Row, Col, Row, CPermuteNumDims_G_M_O>
+    >;
+// clang-format on
+
+TYPED_TEST_SUITE(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, KernelTypes);
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16) { this->Run(); }
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadM)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {136, 128, 32, 128, 2, 3},
+    };
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadN)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 136, 32, 128, 3, 2},
+    };
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadK)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 40, 128, 2, 4},
+        {128, 128, 136, 128, 4, 2},
+    };
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadO)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 136, 1, 3},
+    };
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddM)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {129, 128, 32, 128, 2, 3},
+    };
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddN)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 129, 32, 128, 4, 3},
+    };
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddK)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 33, 128, 2, 3},
+        {128, 128, 129, 128, 2, 3},
+    };
+    this->Run();
+}
+
+// If kernel B1Layout is RowMajor, expect not to support odd O size
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddO)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {128, 128, 32, 129, 2, 3},
+    };
+    this->Run();
+}
+
+// Benchmark-only run for the K sizes 160/80/40: timing is enabled and result
+// verification is turned off, so the only thing checked is the profiler's
+// return value. Disabled by default, like DISABLED_Bench_FP16 below, so that
+// ordinary test runs do not execute a timing sweep; run it explicitly with
+// --gtest_also_run_disabled_tests.
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, DISABLED_Bench_FP16_IrregularK)
+{
+    // Each entry is {M, N, K, O, G0, G1}.
+    this->lengths_ = std::vector<std::vector<int>>{{256, 256, 160, 160, 1, 16},
+                                                   {256, 64, 160, 64, 1, 16},
+                                                   {1024, 1024, 80, 80, 1, 16},
+                                                   {1024, 64, 80, 64, 1, 16},
+                                                   {4096, 4096, 40, 40, 1, 16},
+                                                   {4096, 64, 40, 64, 1, 16}};
+    this->bench_   = true;
+    this->verify_  = false;
+    this->Run();
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, DISABLED_Bench_FP16)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {256, 256, 64, 64, 48, 16},
+        {256, 256, 128, 128, 48, 16},
+        {512, 512, 64, 64, 48, 16},
+        {512, 512, 128, 128, 48, 16},
+        {1024, 1024, 64, 64, 48, 16},
+        {1024, 1024, 128, 128, 48, 16},
+        {2048, 2048, 64, 64, 48, 16},
+        {2048, 2048, 128, 128, 48, 16},
+        {4096, 4096, 64, 64, 48, 16},
+        {4096, 4096, 128, 128, 48, 16},
+    };
+    this->bench_  = true;
+    this->verify_ = false;
+    this->Run();
+}
+
+using ck::tensor_operation::device::GemmSpecialization;
+
+// KPadding specializations are covered below; see GemmSpecializationSizeMismatch for the checks that are still disabled.
+TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMatch)
+{
+    int P = 120; // requires padding
+    int Q = 128; // do not require padding
+
+    // IsSupported(M, N, K, O)
+    // clang-format off
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(Q, Q, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MPadding>{}.IsSupported(P, Q, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NPadding>{}.IsSupported(Q, P, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::KPadding>{}.IsSupported(Q, Q, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNPadding>{}.IsSupported(P, P, Q, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MKPadding>{}.IsSupported(P, Q, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NKPadding>{}.IsSupported(Q, P, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(P, P, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::OPadding>{}.IsSupported(Q, Q, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MOPadding>{}.IsSupported(P, Q, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NOPadding>{}.IsSupported(Q, P, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::KOPadding>{}.IsSupported(Q, Q, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNOPadding>{}.IsSupported(P, P, Q, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MKOPadding>{}.IsSupported(P, Q, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NKOPadding>{}.IsSupported(Q, P, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(P, P, P, P));
+    // clang-format on
+}
+
+TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMismatch)
+{
+    // IsSupported(M, N, K, O)
+    // clang-format off
+    EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(128, 128, 120, 128));
+    // EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(128, 128, 128, 120));
+    // Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % ABSrcScalarPerVector == 0
+    // EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 129, 128));
+    // EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 130, 128));
+    // Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % B1SrcScalarPerVector == 0
+    // EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(128, 128, 128, 129));
+    // clang-format on
+}
+
+TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest)
+{
+    this->lengths_ = std::vector<std::vector<int>>{
+        {49, 49, 64, 64, 4, 6},
+        {64, 49, 64, 64, 4, 6},
+        {1020, 1020, 64, 128, 4, 6},
+        {576, 576, 64, 64, 4, 6},
+    };
+    this->bench_ = true;
+    this->Run();
+}
--- a/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_util.hpp
+++ b/test/batched_gemm_masking_scale_softmax_gemm_permute/test_batched_gemm_masking_scale_softmax_gemm_permute_util.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+
+#include <vector>
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "profiler/include/profile_batched_gemm_masking_scale_softmax_gemm_permute_impl.hpp"
+using ck::tensor_operation::device::GemmSpecialization;
+
+template <ck::index_t N>
+using I = ck::Number<N>;
+
+using F16 = ck::half_t;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+// Typed test fixture that runs
+// ck::profiler::profile_batched_gemm_masking_scale_softmax_gemm_permute_impl
+// once for every problem size stored in lengths_.
+//
+// Tuple layout: <ADataType, B0DataType, B1DataType, CDataType,
+//                ALayout, B0Layout, B1Layout, CPermuteNumDims_G_M_O>.
+template <typename Tuple>
+struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test
+{
+    using ADataType             = std::tuple_element_t<0, Tuple>;
+    using B0DataType            = std::tuple_element_t<1, Tuple>;
+    using B1DataType            = std::tuple_element_t<2, Tuple>;
+    using CDataType             = std::tuple_element_t<3, Tuple>;
+    using ALayout               = std::tuple_element_t<4, Tuple>;
+    using B0Layout              = std::tuple_element_t<5, Tuple>;
+    using B1Layout              = std::tuple_element_t<6, Tuple>;
+    using CPermuteNumDims_G_M_O = std::tuple_element_t<7, Tuple>;
+
+    // Problem sizes to run; every entry is {M, N, K, O, G0, G1}.
+    // Individual tests overwrite this list before calling Run().
+    std::vector<std::vector<int>> lengths_ = {
+        {256, 256, 64, 64, 6, 4},
+        {256, 256, 128, 128, 4, 6},
+        {512, 512, 64, 64, 3, 2},
+        {512, 512, 128, 128, 2, 3},
+        {1024, 1024, 64, 64, 3, 1},
+        {1024, 1024, 128, 128, 1, 1},
+    };
+    bool bench_  = false; // forwarded to the profiler as its 4th argument
+    bool verify_ = true;  // forwarded to the profiler as its 1st argument
+
+    // Runs the profiler for a single problem size and records a test failure
+    // if it reports false.
+    void RunSingle(int M, int N, int K, int O, int G0, int G1)
+    {
+        // NOTE(review): the literals `1` and `false` are presumably the init
+        // method and a logging switch -- confirm against the profiler signature.
+        bool pass = ck::profiler::profile_batched_gemm_masking_scale_softmax_gemm_permute_impl<
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            ALayout,
+            B0Layout,
+            B1Layout,
+            CPermuteNumDims_G_M_O>(verify_, 1, false, bench_, M, N, K, O, G0, G1);
+
+        EXPECT_TRUE(pass);
+    }
+
+    // Calls RunSingle() for every entry of lengths_, in order.
+    void Run()
+    {
+        // Entries are only read, so bind by const reference instead of copying
+        // each inner vector on every iteration.
+        for(const auto& lengths : this->lengths_)
+        {
+            // Six elements are indexed below; fail the test instead of reading
+            // out of bounds when an entry is malformed.
+            ASSERT_EQ(lengths.size(), 6u) << "each lengths_ entry must be {M, N, K, O, G0, G1}";
+
+            int M  = lengths[0];
+            int N  = lengths[1];
+            int K  = lengths[2];
+            int O  = lengths[3];
+            int G0 = lengths[4];
+            int G1 = lengths[5];
+
+            this->RunSingle(M, N, K, O, G0, G1);
+        }
+    }
+};
+
+// Wraps one fixed FP16 instantiation of
+// DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle (A row-major, B0 column-major,
+// B1 row-major, masking enabled) so that tests can ask whether a given
+// (M, N, K, O) problem size is accepted under the GemmSpec padding
+// specialization supplied as the template parameter.
+template <GemmSpecialization GemmSpec>
+struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+    using Scale       = ck::tensor_operation::element_wise::Scale;
+
+    using ALayout  = Row;
+    using B0Layout = Col;
+    using B1Layout = Row;
+
+    template <ck::index_t... Is>
+    using S = ck::Sequence<Is...>;
+    using CPermuteNumDims_G_M_O =
+        S<2, 1, 1>; // "using CLayout = Row" has been replaced by CPermuteNumDims_G_M_O
+
+    using ADataType        = F16;
+    using B0DataType       = F16;
+    using B1DataType       = F16;
+    using AccDataType      = float;
+    using CShuffleDataType = F16;
+    using CDataType        = F16;
+
+    using AElementOp    = PassThrough;
+    using B0ElementOp   = PassThrough;
+    using Acc0ElementOp = Scale;
+    using B1ElementOp   = PassThrough;
+    using CElementOp    = PassThrough;
+
+    using DeviceGemmGemmInstance =
+        ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+            ALayout,
+            B0Layout,
+            B1Layout,
+            CPermuteNumDims_G_M_O,
+            ADataType,
+            B0DataType,
+            B1DataType,
+            CDataType,
+            AccDataType,
+            CShuffleDataType,
+            AElementOp,
+            B0ElementOp,
+            Acc0ElementOp,
+            B1ElementOp,
+            CElementOp,
+            GemmSpec,
+            1,
+            256,
+            128,         // MPerBlock
+            128,         // NPerBlock
+            32,          // KPerBlock
+            128,         // Gemm1NPerBlock
+            32,          // Gemm1KPerBlock
+            8,           // AK1
+            8,           // BK1
+            2,           // B1K1
+            32,          // MPerXDL
+            32,          // NPerXDL
+            1,           // MXdlPerWave
+            4,           // NXdlPerWave
+            4,           // Gemm1NXdlPerWave
+            S<4, 64, 1>, // ABlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            8,
+            8,
+            true,
+            S<4, 64, 1>, // BBlockTransfer
+            S<1, 0, 2>,
+            S<1, 0, 2>,
+            2,
+            8,
+            8,
+            true,
+            S<8, 32, 1>, // B1BlockTransfer
+            S<0, 2, 1>,
+            S<0, 2, 1>,
+            1,
+            4,
+            2,
+            false,
+            1,              // CShuffleMXdlPerWavePerShuffle
+            2,              // CShuffleNXdlPerWavePerShuffle
+            S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+            true>;          // Masking
+
+    // Returns the device op's IsSupportedArgument() verdict for the given
+    // problem size. The argument is built with null data pointers, a batch
+    // count of 0 and zero A/B0/B1 strides; no kernel is launched here.
+    // Reads no member state, hence const.
+    bool IsSupported(int M, int N, int K, int O) const
+    {
+        auto gemm     = DeviceGemmGemmInstance{};
+        auto argument = gemm.MakeArgument(static_cast<ADataType*>(nullptr),
+                                          static_cast<B0DataType*>(nullptr),
+                                          static_cast<B1DataType*>(nullptr),
+                                          static_cast<CDataType*>(nullptr),
+                                          M,
+                                          N,
+                                          K,
+                                          O,
+                                          0,              // BatchCount
+                                          {0, 0, M, O},   // gs ms ns lengths
+                                          {0, O, 0, 1},   // gs ms ns strides
+                                          0,              // StrideA
+                                          0,              // StrideB0
+                                          0,              // StrideB1
+                                          0,              // BatchStrideA
+                                          0,              // BatchStrideB0
+                                          0,              // BatchStrideB1
+                                          PassThrough{},  // a_element_op
+                                          PassThrough{},  // b0_element_op
+                                          Scale{1.f},     // acc0_element_op
+                                          PassThrough{},  // b1_element_op
+                                          PassThrough{}); // c_element_op
+
+        return gemm.IsSupportedArgument(argument);
+    }
+};
--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp
@@ -105,6 +105,19 @@ TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, DISABLED_Bench_FP16)
    this->Run();
 }

+TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, DISABLED_Bench_FP16_IrregularK)
+{
+    this->lengths_ = std::vector<std::vector<int>>{{256, 256, 160, 160, 16},
+                                                   {256, 64, 160, 64, 16},
+                                                   {1024, 1024, 80, 80, 16},
+                                                   {1024, 64, 80, 64, 16},
+                                                   {4096, 4096, 40, 40, 16},
+                                                   {4096, 64, 40, 64, 16}};
+    this->bench_   = true;
+    this->verify_  = false;
+    this->Run();
+}
+
 using ck::tensor_operation::device::GemmSpecialization;

 // TODO: enable KPadding tests when it is implemented
@@ -118,19 +131,19 @@ TEST(TestBatchedGemmSoftmaxGemmInterface, GemmSpecializationSizeMatch)
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::Default>{}.IsSupported(Q, Q, Q, Q));
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MPadding>{}.IsSupported(P, Q, Q, Q));
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NPadding>{}.IsSupported(Q, P, Q, Q));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::KPadding>{}.IsSupported(Q, Q, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::KPadding>{}.IsSupported(Q, Q, P, Q));
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNPadding>{}.IsSupported(P, P, Q, Q));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MKPadding>{}.IsSupported(P, Q, P, Q));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NKPadding>{}.IsSupported(Q, P, P, Q));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(P, P, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MKPadding>{}.IsSupported(P, Q, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NKPadding>{}.IsSupported(Q, P, P, Q));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKPadding>{}.IsSupported(P, P, P, Q));
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::OPadding>{}.IsSupported(Q, Q, Q, P));
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MOPadding>{}.IsSupported(P, Q, Q, P));
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NOPadding>{}.IsSupported(Q, P, Q, P));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::KOPadding>{}.IsSupported(Q, Q, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::KOPadding>{}.IsSupported(Q, Q, P, P));
    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNOPadding>{}.IsSupported(P, P, Q, P));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MKOPadding>{}.IsSupported(P, Q, P, P));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NKOPadding>{}.IsSupported(Q, P, P, P));
-    // EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(P, P, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MKOPadding>{}.IsSupported(P, Q, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::NKOPadding>{}.IsSupported(Q, P, P, P));
+    EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128<GemmSpecialization::MNKOPadding>{}.IsSupported(P, P, P, P));
    // clang-format on
 }


--- a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
+++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp
@@ -29,14 +29,19 @@ struct TestBatchedGemmSoftmaxGemm : public ::testing::Test
    using B1Layout   = std::tuple_element_t<6, Tuple>;
    using CLayout    = std::tuple_element_t<7, Tuple>;

-    std::vector<std::vector<int>> lengths_ = {
-        {256, 256, 64, 64, 4},
-        {256, 256, 128, 128, 4},
-        {512, 512, 64, 64, 2},
-        {512, 512, 128, 128, 2},
-        {1024, 1024, 64, 64, 1},
-        {1024, 1024, 128, 128, 1},
-    };
+    std::vector<std::vector<int>> lengths_ = {{256, 256, 64, 64, 4},
+                                              {256, 256, 128, 128, 4},
+                                              {512, 512, 64, 64, 2},
+                                              {512, 512, 128, 128, 2},
+                                              {1024, 1024, 64, 64, 1},
+                                              {1024, 1024, 128, 128, 1},
+                                              {256, 256, 160, 160, 4},
+                                              {256, 64, 160, 64, 4},
+                                              {1024, 1024, 80, 80, 2},
+                                              {1024, 64, 80, 64, 2},
+                                              {4096, 4096, 40, 40, 1},
+                                              {4096, 64, 40, 64, 1}};
+
    bool bench_  = false;
    bool verify_ = true;

@@ -155,7 +160,8 @@ struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128
            1,              // CShuffleMXdlPerWavePerShuffle
            2,              // CShuffleNXdlPerWavePerShuffle
            S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
-            8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+            8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+            false>;

    bool IsSupported(int M, int N, int K, int O)
    {

--- a/test/layernorm/CMakeLists.txt
+++ b/test/layernorm/CMakeLists.txt
 add_custom_target(test_layernorm) # umbrella target; the executables below are attached to it via add_dependencies

-add_gtest_executable(test_layernorm_fp32 test_layernorm_fp32.cpp)
-add_gtest_executable(test_layernorm_fp16 test_layernorm_fp16.cpp)
+add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
+add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
+add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
+add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)

-target_link_libraries(test_layernorm_fp32 PRIVATE utility)
-target_link_libraries(test_layernorm_fp16 PRIVATE utility)
+target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
+target_link_libraries(test_layernorm2d_fp16 PRIVATE utility)
+target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance) # groupnorm tests also link device_normalization_instance
+target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
+
+add_dependencies(test_layernorm test_layernorm2d_fp32) # test_layernorm depends on each of the four test executables
+add_dependencies(test_layernorm test_layernorm2d_fp16)
+add_dependencies(test_layernorm test_groupnorm_fp16)
+add_dependencies(test_layernorm test_groupnorm_fp32)

-add_dependencies(test_layernorm test_layernorm_fp32)
-add_dependencies(test_layernorm test_layernorm_fp16)
--- a/test/layernorm/test_groupnorm_fp16.cpp
+++ b/test/layernorm/test_groupnorm_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "profiler/include/profile_groupnorm_impl.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+using ck::index_t;
+
+/// Typed GoogleTest fixture that drives the groupnorm profiler over a fixed list of shapes.
+///
+/// `Tuple` supplies, in order: XDataType, GammaDataType, BetaDataType, AccDataType, YDataType.
+template <typename Tuple>
+class TestGroupnorm : public ::testing::Test
+{
+    protected:
+    using XDataType     = std::tuple_element_t<0, Tuple>;
+    using GammaDataType = std::tuple_element_t<1, Tuple>;
+    using BetaDataType  = std::tuple_element_t<2, Tuple>;
+    using AccDataType   = std::tuple_element_t<3, Tuple>;
+    using YDataType     = std::tuple_element_t<4, Tuple>;
+
+    /// Calls ck::profiler::profile_groupnorm_impl once per shape and expects every call to
+    /// return true.
+    void Run()
+    {
+        // N, H, W, G, C
+        const std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
+                                                               {1, 2, 3, 4, 5},
+                                                               {256, 9, 9, 9, 9},
+                                                               {1, 64, 64, 32, 10},
+                                                               {1, 32, 32, 32, 20},
+                                                               {2, 32, 32, 32, 30},
+                                                               {2, 32, 32, 32, 40},
+                                                               {1, 16, 16, 32, 40}};
+
+        // Iterate by const reference so the shape vector is not copied on every iteration.
+        for(const auto& length : lengths)
+        {
+            // NOTE(review): the leading arguments are presumably do_verification, init_method,
+            // do_log and time_kernel -- confirm against profile_groupnorm_impl.hpp.
+            const bool success =
+                ck::profiler::profile_groupnorm_impl<XDataType,
+                                                     GammaDataType,
+                                                     BetaDataType,
+                                                     AccDataType,
+                                                     YDataType>(true, 2, false, false, length);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+
+// The typed suite is instantiated once per entry of this list. Repeating the same tuple only
+// reruns an identical sweep, so the type combination is listed a single time.
+using KernelTypes = ::testing::Types<
+    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType
+    std::tuple<F16, F16, F16, F32, F16>>;
+
+// Register the typed suite and one test that executes Run() for each KernelTypes entry.
+TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
+TYPED_TEST(TestGroupnorm, Test_FP16) { this->Run(); }
--- a/test/layernorm/test_groupnorm_fp32.cpp
+++ b/test/layernorm/test_groupnorm_fp32.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "gtest/gtest.h"
+#include "profiler/include/profile_groupnorm_impl.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+using ck::index_t;
+
+/// Typed GoogleTest fixture that drives the groupnorm profiler over a fixed list of shapes.
+///
+/// `Tuple` supplies, in order: XDataType, GammaDataType, BetaDataType, AccDataType, YDataType.
+template <typename Tuple>
+class TestGroupnorm : public ::testing::Test
+{
+    protected:
+    using XDataType     = std::tuple_element_t<0, Tuple>;
+    using GammaDataType = std::tuple_element_t<1, Tuple>;
+    using BetaDataType  = std::tuple_element_t<2, Tuple>;
+    using AccDataType   = std::tuple_element_t<3, Tuple>;
+    using YDataType     = std::tuple_element_t<4, Tuple>;
+
+    /// Calls ck::profiler::profile_groupnorm_impl once per shape and expects every call to
+    /// return true.
+    void Run()
+    {
+        // N, H, W, G, C
+        const std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
+                                                               {1, 2, 3, 4, 5},
+                                                               {256, 9, 9, 9, 9},
+                                                               {1, 64, 64, 32, 10},
+                                                               {1, 32, 32, 32, 20},
+                                                               {1, 16, 16, 32, 40}};
+
+        // Iterate by const reference so the shape vector is not copied on every iteration.
+        for(const auto& length : lengths)
+        {
+            // NOTE(review): the leading arguments are presumably do_verification, init_method,
+            // do_log and time_kernel -- confirm against profile_groupnorm_impl.hpp.
+            const bool success =
+                ck::profiler::profile_groupnorm_impl<XDataType,
+                                                     GammaDataType,
+                                                     BetaDataType,
+                                                     AccDataType,
+                                                     YDataType>(true, 2, false, false, length);
+            EXPECT_TRUE(success);
+        }
+    }
+};
+
+// The typed suite is instantiated once per entry of this list. Repeating the same tuple only
+// reruns an identical sweep, so the type combination is listed a single time.
+using KernelTypes = ::testing::Types<
+    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType
+    std::tuple<F32, F32, F32, F32, F32>>;
+
+// Register the typed suite and one test that executes Run() for each KernelTypes entry.
+TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
+TYPED_TEST(TestGroupnorm, Test_FP32) { this->Run(); }
--- a/test/layernorm/test_layernorm_fp16.cpp
+++ b/test/layernorm/test_layernorm_fp16.cpp
@@ -2,28 +2,28 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "test_layernorm_util.hpp"
+#include "test_layernorm2d_util.hpp"

 template <ck::index_t N>
 using I = ck::Number<N>;

 template <typename Tuple>
-class TestLayernormFP16 : public ck::TestLayernorm<Tuple>
+class TestLayernorm2dFP16 : public ck::TestLayernorm2d<Tuple>
 {
 };

 // clang-format off
 using KernelTypes = ::testing::Types<
-//  XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, , GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>
+//  XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>
    >;
 // clang-format on
-TYPED_TEST_SUITE(TestLayernormFP16, KernelTypes);
-TYPED_TEST(TestLayernormFP16, Test_FP16) { this->Run(); }
+TYPED_TEST_SUITE(TestLayernorm2dFP16, KernelTypes);
+TYPED_TEST(TestLayernorm2dFP16, Test_FP16) { this->Run(); }
--- a/test/layernorm/test_layernorm_fp32.cpp
+++ b/test/layernorm/test_layernorm_fp32.cpp
@@ -2,28 +2,28 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "test_layernorm_util.hpp"
+#include "test_layernorm2d_util.hpp"

 template <ck::index_t N>
 using I = ck::Number<N>;

 template <typename Tuple>
-class TestLayernormFP32 : public ck::TestLayernorm<Tuple>
+class TestLayernorm2dFP32 : public ck::TestLayernorm2d<Tuple>
 {
 };

 // clang-format off
 using KernelTypes = ::testing::Types<
-//  XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, , GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>,
-    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<4>, I<4>, I<4>, I<4>>
+//  XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>,
+    std::tuple<float, float, float, float, float, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<4>, I<1>, I<4>, I<1>, I<4>, I<4>>
    >;
 // clang-format on
-TYPED_TEST_SUITE(TestLayernormFP32, KernelTypes);
-TYPED_TEST(TestLayernormFP32, Test_FP32) { this->Run(); }
+TYPED_TEST_SUITE(TestLayernorm2dFP32, KernelTypes);
+TYPED_TEST(TestLayernorm2dFP32, Test_FP32) { this->Run(); }
--- a/test/layernorm/test_layernorm_util.hpp
+++ b/test/layernorm/test_layernorm_util.hpp
@@ -31,7 +31,7 @@ std::string serialize_range(const Range& range)
 }

 template <typename Tuple>
-class TestLayernorm : public ::testing::Test
+class TestLayernorm2d : public ::testing::Test
 {
    protected:
    using XDataType                             = std::tuple_element_t<0, Tuple>;
@@ -48,9 +48,11 @@ class TestLayernorm : public ::testing::Test
    static constexpr index_t KThreadSliceSize   = std::tuple_element_t<11, Tuple>{}.value;
    static constexpr index_t XYSrcVectorDim     = std::tuple_element_t<12, Tuple>{}.value;
    static constexpr index_t XSrcVectorSize     = std::tuple_element_t<13, Tuple>{}.value;
-    static constexpr index_t GammaSrcVectorSize = std::tuple_element_t<14, Tuple>{}.value;
-    static constexpr index_t BetaSrcVectorSize  = std::tuple_element_t<15, Tuple>{}.value;
-    static constexpr index_t YDstVectorSize     = std::tuple_element_t<16, Tuple>{}.value;
+    static constexpr index_t GammaSrcVectorDim  = std::tuple_element_t<14, Tuple>{}.value;
+    static constexpr index_t GammaSrcVectorSize = std::tuple_element_t<15, Tuple>{}.value;
+    static constexpr index_t BetaSrcVectorDim   = std::tuple_element_t<16, Tuple>{}.value;
+    static constexpr index_t BetaSrcVectorSize  = std::tuple_element_t<17, Tuple>{}.value;
+    static constexpr index_t YDstVectorSize     = std::tuple_element_t<18, Tuple>{}.value;

    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

@@ -78,23 +80,24 @@ class TestLayernorm : public ::testing::Test
                                                                         KThreadSliceSize,
                                                                         XYSrcVectorDim,
                                                                         XSrcVectorSize,
+                                                                         GammaSrcVectorDim,
                                                                         GammaSrcVectorSize,
+                                                                         BetaSrcVectorDim,
                                                                         BetaSrcVectorSize,
                                                                         YDstVectorSize>;

-    TestLayernorm() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}
+    TestLayernorm2d() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {}

-    void RunSingle(std::vector<index_t> lengths, std::vector<index_t> reduceDims)
+    void RunSingle(const std::vector<index_t>& lengths,
+                   const std::vector<index_t>& reduceDims,
+                   const std::vector<index_t>& GammaLength,
+                   const std::vector<index_t>& GammaStride,
+                   const std::vector<index_t>& BetaLength,
+                   const std::vector<index_t>& BetaStride)
    {
-        std::vector<index_t> reduceLength(reduceDims.size());
-        for(int i = 0; i < NumReduceDim; ++i)
-        {
-            reduceLength[i] = lengths[reduceDims[i]];
-        }
-
        Tensor<XDataType> x(lengths);
-        Tensor<GammaDataType> gamma(reduceLength);
-        Tensor<BetaDataType> beta(reduceLength);
+        Tensor<GammaDataType> gamma(GammaLength);
+        Tensor<BetaDataType> beta(BetaLength);
        Tensor<YDataType> y(lengths);
        Tensor<YDataType> y_ref(lengths);

@@ -115,10 +118,8 @@ class TestLayernorm : public ::testing::Test
        auto argument_ptr    = device_instance.MakeArgumentPointer(
            lengths,
            std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
-            std::vector<ck::index_t>{gamma.mDesc.GetStrides().begin(),
-                                     gamma.mDesc.GetStrides().end()},
-            std::vector<ck::index_t>{beta.mDesc.GetStrides().begin(),
-                                     beta.mDesc.GetStrides().end()},
+            GammaStride,
+            BetaStride,
            std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
            reduceDims,
            1e-4,
@@ -163,17 +164,16 @@ class TestLayernorm : public ::testing::Test

    void Run()
    {
-        for(auto length : this->lengths_)
+        std::vector<std::vector<index_t>> lengths = {
+            {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
+
+        for(auto length : lengths)
        {
-            this->RunSingle(length, reduceDims_[0]);
+            this->RunSingle(length, {1}, {length[1]}, {0, 1}, {length[1]}, {0, 1});
        }
    }

-    std::vector<std::vector<index_t>> lengths_ = {
-        {4, 256}, {8, 511}, {9, 1032}, {4, 2048}, {1, 8192}, {4000, 2000}};
-
-    std::vector<std::vector<index_t>> reduceDims_ = {{1}};
-
    typename ReferenceInstance::Invoker ref_instance_invoker_;
 };
+
 } // namespace ck